How to Create an Any-Page Web Scraper in Visual Basic

Introduction: Welcome to a tutorial on how to make a Visual Basic program that scrapes the text between two given points on a given page and writes the matches out as a list. Pre-Creation: My form has the following controls: TextBox1 (extract from), TextBox2 (extract to), TextBox3 (page to extract from) and Button1 (begin extraction). Steps of Creation: Step 1: First we need some imports and a function. The function lets us scrape the data between the two given points.
  1. Imports System.IO
  2. Imports System.Text.RegularExpressions
  3. Imports System.Net
  1. ''' <summary>
  2. ''' Returns every piece of text in Source that lies between an occurrence of Str1 and the next occurrence of Str2.
  3. ''' Str1 and Str2 are matched as literal text. If Str2 does not follow a given Str1, everything up to the next Str1 (or the end of Source) is returned for that match.
  4. ''' </summary>
  5. ''' <param name="Source">The text to search.</param>
  6. ''' <param name="Str1">The text that marks the start of each match.</param>
  7. ''' <param name="Str2">The text that marks the end of each match.</param>
  8. ''' <returns>The matches in the order they appear; an empty array when there are none.</returns>
  9. Private Function GetBetweenAll(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String) As String()
  10.     Dim Results As New List(Of String)
  11.     ' Without text to search or without both markers there is nothing to extract
  12.     If String.IsNullOrEmpty(Source) OrElse String.IsNullOrEmpty(Str1) OrElse String.IsNullOrEmpty(Str2) Then
  13.         Return Results.ToArray()
  14.     End If
  15.     ' Escape the markers so characters such as ( ) [ ] ? + . are matched literally instead of being read as regex syntax
  16.     Dim Pieces As String() = Regex.Split(Source, Regex.Escape(Str1))
  17.     Dim EndPattern As String = Regex.Escape(Str2)
  18.     ' Pieces(0) is the text before the first Str1, so start at index 1
  19.     For Index As Integer = 1 To Pieces.Length - 1
  20.         ' Keep only the part of each piece that comes before the first Str2
  21.         Results.Add(Regex.Split(Pieces(Index), EndPattern)(0))
  22.     Next
  23.     Return Results.ToArray()
  24. End Function
Step 2: Next we want to create the code that begins the process. First we check that all of the text boxes are filled in; if they are, we show a SaveFileDialog so the user can choose a .txt save path.
  1. ' Only continue when both markers and the page address have been entered
  2. If (Not TextBox1.Text = Nothing AndAlso Not TextBox2.Text = Nothing AndAlso Not TextBox3.Text = Nothing) Then
  3.     ' Using disposes the dialog once we are finished with it
  4.     Using fo As New SaveFileDialog
  5.         fo.Filter = "Text Files|*.txt"
  6.         fo.FilterIndex = 1
  7.         fo.Title = "Save Path"
  8.         ' Continue only if the user confirmed the dialog and chose a file
  9.         If (fo.ShowDialog() = System.Windows.Forms.DialogResult.OK AndAlso Not fo.FileName = Nothing) Then
  10.             ' The download, extraction and saving code goes here
  11.         End If
  12.     End Using
  13. End If
Step 3: Once the inputs and the save path are ready, we download the source code of the page at the given URL, extract the data and write it to the save path. (Below is the full button code.)
  1. ''' <summary>
  2. ''' Downloads the page named in TextBox3, extracts every piece of text between the markers in TextBox1 and TextBox2, and writes one match per line to a text file chosen by the user.
  3. ''' </summary>
  4. Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click
  5.     ' All three text boxes must be filled in before anything happens
  6.     If (Not TextBox1.Text = Nothing AndAlso Not TextBox2.Text = Nothing AndAlso Not TextBox3.Text = Nothing) Then
  7.         ' Using disposes the dialog once we are finished with it
  8.         Using fo As New SaveFileDialog
  9.             fo.Filter = "Text Files|*.txt"
  10.             fo.FilterIndex = 1
  11.             fo.Title = "Save Path"
  12.             ' Continue only if the user confirmed the dialog and chose a file
  13.             If (fo.ShowDialog() = System.Windows.Forms.DialogResult.OK AndAlso Not fo.FileName = Nothing) Then
  14.                 Dim r As HttpWebRequest = DirectCast(WebRequest.Create(TextBox3.Text), HttpWebRequest)
  15.                 Dim src As String
  16.                 ' Release the connection and the reader as soon as the page has been read
  17.                 Using re As HttpWebResponse = DirectCast(r.GetResponse(), HttpWebResponse)
  18.                     Using reader As New StreamReader(re.GetResponseStream())
  19.                         src = reader.ReadToEnd()
  20.                     End Using
  21.                 End Using
  22.                 Dim srcs As String() = GetBetweenAll(src, TextBox1.Text, TextBox2.Text)
  23.                 ' Write one match per line to the chosen file
  24.                 Using sw As New StreamWriter(fo.FileName)
  25.                     For Each s As String In srcs
  26.                         sw.WriteLine(s)
  27.                     Next
  28.                 End Using
  29.             End If
  30.         End Using
  31.     End If
  32. End Sub
Project Complete! Below is the full source code, along with a download of the project files.
  1. Imports System.IO
  2. Imports System.Text.RegularExpressions
  3. Imports System.Net
  4. Public Class Form1
  5.     ''' <summary>
  6.     ''' Returns every piece of text in Source that lies between an occurrence of Str1 and the next occurrence of Str2.
  7.     ''' Str1 and Str2 are matched as literal text. If Str2 does not follow a given Str1, everything up to the next Str1 (or the end of Source) is returned for that match.
  8.     ''' </summary>
  9.     ''' <param name="Source">The text to search.</param>
  10.     ''' <param name="Str1">The text that marks the start of each match.</param>
  11.     ''' <param name="Str2">The text that marks the end of each match.</param>
  12.     ''' <returns>The matches in the order they appear; an empty array when there are none.</returns>
  13.     Private Function GetBetweenAll(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String) As String()
  14.         Dim Results As New List(Of String)
  15.         ' Without text to search or without both markers there is nothing to extract
  16.         If String.IsNullOrEmpty(Source) OrElse String.IsNullOrEmpty(Str1) OrElse String.IsNullOrEmpty(Str2) Then
  17.             Return Results.ToArray()
  18.         End If
  19.         ' Escape the markers so characters such as ( ) [ ] ? + . are matched literally instead of being read as regex syntax
  20.         Dim Pieces As String() = Regex.Split(Source, Regex.Escape(Str1))
  21.         Dim EndPattern As String = Regex.Escape(Str2)
  22.         ' Pieces(0) is the text before the first Str1, so start at index 1
  23.         For Index As Integer = 1 To Pieces.Length - 1
  24.             ' Keep only the part of each piece that comes before the first Str2
  25.             Results.Add(Regex.Split(Pieces(Index), EndPattern)(0))
  26.         Next
  27.         Return Results.ToArray()
  28.     End Function
  29. 
  30.     ''' <summary>
  31.     ''' Downloads the page named in TextBox3, extracts every piece of text between the markers in TextBox1 and TextBox2, and writes one match per line to a text file chosen by the user.
  32.     ''' </summary>
  33.     Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click
  34.         ' All three text boxes must be filled in before anything happens
  35.         If (Not TextBox1.Text = Nothing AndAlso Not TextBox2.Text = Nothing AndAlso Not TextBox3.Text = Nothing) Then
  36.             ' Using disposes the dialog once we are finished with it
  37.             Using fo As New SaveFileDialog
  38.                 fo.Filter = "Text Files|*.txt"
  39.                 fo.FilterIndex = 1
  40.                 fo.Title = "Save Path"
  41.                 ' Continue only if the user confirmed the dialog and chose a file
  42.                 If (fo.ShowDialog() = System.Windows.Forms.DialogResult.OK AndAlso Not fo.FileName = Nothing) Then
  43.                     Dim r As HttpWebRequest = DirectCast(WebRequest.Create(TextBox3.Text), HttpWebRequest)
  44.                     Dim src As String
  45.                     ' Release the connection and the reader as soon as the page has been read
  46.                     Using re As HttpWebResponse = DirectCast(r.GetResponse(), HttpWebResponse)
  47.                         Using reader As New StreamReader(re.GetResponseStream())
  48.                             src = reader.ReadToEnd()
  49.                         End Using
  50.                     End Using
  51.                     Dim srcs As String() = GetBetweenAll(src, TextBox1.Text, TextBox2.Text)
  52.                     ' Write one match per line to the chosen file
  53.                     Using sw As New StreamWriter(fo.FileName)
  54.                         For Each s As String In srcs
  55.                             sw.WriteLine(s)
  56.                         Next
  57.                     End Using
  58.                 End If
  59.             End Using
  60.         End If
  61.     End Sub
  62. End Class

Add new comment