Introduction:
Today we will be making a simple program to extract all emails within a web page source. We will be doing it manually rather than using Regex.
Notes:
This will require a webpage which contains one or more emails. I will be using this list for testing: http://pastebin.com/KBzSZVgh
Steps of Creation:
Step 1:
First we are going to create a new project with:
1 Text-boxes - URL
1 Button - Begin new thread (which will begin the extractor script)
1 Listbox - Contain emails
Step 2:
Next we need to Import four packages. One for creating a request and receiving a response to and from the web page, another to read the response and the final one to create a new thread and the last for Regex.
Imports System.IO
Imports System.Net
Imports System.Text.RegularExpressions
Imports System.Threading
Step 3:
The next step we want to add a function which we will use to get the text between all the HTML tags. We will be splitting the source by a space to get all the words. Then check each word if it contains appropriate signs ("@", ".") and if it does it may contain tags so we need to remove them.
Private Function GetBetweenAll(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String) As String()
Dim Results, T As New List(Of String)
T.AddRange(Regex.Split(Source, Str1))
T.RemoveAt(0)
For Each I As String In T
Results.Add(Regex.Split(I, Str2)(0))
Next
Return Results.ToArray
End Function
Step 4:
In the button one click event we are going to create a new thread which will start a function named "extract".
Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click
Dim trd As thread = New thread(AddressOf extract)
trd.isbackground = True
trd.start()
End Sub
Step 5:
Now lets check the URL in textbox1 to see if it is valid, if it is lets create a request, receive the response and read it to gain the web page source.
Private Function extract()
If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
r.KeepAlive = True
r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
Dim re As HttpWebResponse = r.GetResponse()
Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
Else : MsgBox("That is not a valid link!")
End If
End Function
Step 6:
Once we have got the source lets split it by a space to get each "word"/tag and check to see if each could be an email by looking for the "@" and "." signs.
Private Function extract()
If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
r.KeepAlive = True
r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
Dim re As HttpWebResponse = r.GetResponse()
Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
Dim words As String() = src.Split(" ")
For Each word As String In words
If (word.Contains("@") And word.Contains(".")) Then
End If
Next
Else : MsgBox("That is not a valid link!")
End If
End Function
Step 7:
Now lets check to see if that particular word contains a "" and ">" which means it could be a tag. We don't want the tags so lets get the String between ">" and "".
Private Function extract()
If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
r.KeepAlive = True
r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
Dim re As HttpWebResponse = r.GetResponse()
Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
Dim words As String() = src.Split(" ")
For Each word As String In words
If (word.Contains("@") And word.Contains(".")) Then
If (word.Contains("<") And word.Contains(">")) Then
Dim toAdd As New List(Of String)
Dim noTags As String() = GetBetweenAll(word, ">", "<")
Else
ListBox1.Items.Add(word)
End If
End If
Next
Else : MsgBox("That is not a valid link!")
End If
End Function
Step 8:
Finally, lets check how many emails there are. If there are more than one in the "toAdd" List, lets just add it to our listbox, otherwise lets iterate through them all and add them all to the listbox.
Private Function extract()
If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
r.KeepAlive = True
r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
Dim re As HttpWebResponse = r.GetResponse()
Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
Dim words As String() = src.Split(" ")
For Each word As String In words
If (word.Contains("@") And word.Contains(".")) Then
If (word.Contains("<") And word.Contains(">")) Then
Dim toAdd As New List(Of String)
Dim noTags As String() = GetBetweenAll(word, ">", "<")
For Each w As String In noTags
If (w.Contains("@") And w.Contains(".") And Not w.Contains("=")) Then
If (w.EndsWith(",") Or w.EndsWith(".")) Then
toAdd.Add(w.Substring(0, w.Length - 1))
Else
toAdd.Add(w)
End If
End If
Next
If (toAdd.Count > 0) Then
If (toAdd.Count > 1) Then
For Each t As String In toAdd
ListBox1.Items.Add(t)
Next
Else
ListBox1.Items.Add(toAdd(0))
End If
End If
Else
ListBox1.Items.Add(word)
End If
End If
Next
Else : MsgBox("That is not a valid link!")
End If
End Function
Important!
To be able to access the controls from a new thread we need to set CheckForIllegalCrossThreadCalls to False in the Form1_load event.
Remove Duplicates Function
Lets add one last feature to remove all duplicate emails. This is very simple and just iterates through each item in the listbox1 and checks against a newly created list. If it is already in the list it won't add it otherwise it will. Then it simply iterates through the new list to add them all back in to the listbox1 (After clearing it first of course!)
Of course, you could add this as a new thread as well so it doesn't temporarily crash the UI while processing.
Private Sub Button3_Click(sender As Object, e As EventArgs) Handles Button3.Click
Dim items As New List(Of String)
For Each i As String In ListBox1.Items
Dim isNew As Boolean = True
For Each it As String In items
If (it = i) Then isNew = False
Next
If (isNew) Then items.Add(i)
Next
ListBox1.Items.Clear()
For Each i As String In items
ListBox1.Items.Add(i)
Next
End Sub
Project Completed!
That's it! Here is the finished source:
Imports System.IO
Imports System.Net
Imports System.Text.RegularExpressions
Imports System.Threading
Public Class Form1
Private Function GetBetweenAll(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String) As String()
Dim Results, T As New List(Of String)
T.AddRange(Regex.Split(Source, Str1))
T.RemoveAt(0)
For Each I As String In T
Results.Add(Regex.Split(I, Str2)(0))
Next
Return Results.ToArray
End Function
Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button2.Click
Dim trd As Thread = New Thread(AddressOf extract)
trd.IsBackground = True
trd.Start()
End Sub
Private Function extract()
If (TextBox1.Text.StartsWith("http://") Or TextBox1.Text.StartsWith("https://")) Then
Dim r As HttpWebRequest = HttpWebRequest.Create(TextBox1.Text)
r.KeepAlive = True
r.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36"
Dim re As HttpWebResponse = r.GetResponse()
Dim src As String = New StreamReader(re.GetResponseStream()).ReadToEnd()
Dim words As String() = src.Split(" ")
For Each word As String In words
If (word.Contains("@") And word.Contains(".")) Then
If (word.Contains("<") And word.Contains(">")) Then
Dim toAdd As New List(Of String)
Dim noTags As String() = GetBetweenAll(word, ">", "<")
For Each w As String In noTags
If (w.Contains("@") And w.Contains(".") And Not w.Contains("=")) Then
If (w.EndsWith(",") Or w.EndsWith(".")) Then
toAdd.Add(w.Substring(0, w.Length - 1))
Else
toAdd.Add(w)
End If
End If
Next
If (toAdd.Count > 0) Then
If (toAdd.Count > 1) Then
For Each t As String In toAdd
ListBox1.Items.Add(t)
Next
Else
ListBox1.Items.Add(toAdd(0))
End If
End If
Else
ListBox1.Items.Add(word)
End If
End If
Next
Else : MsgBox("That is not a valid link!")
End If
End Function
Private Function removeTags(ByVal w As String)
Dim toReturn As New List(Of String)
Dim noTags As String() = GetBetweenAll(w, ">", "<")
For Each word As String In noTags
If (word.Contains("@") And word.Contains(".") And Not word.Contains("=")) Then
toReturn.Add(word)
End If
Next
Return toReturn
End Function
Private Sub Form1_Load(sender As Object, e As EventArgs) Handles Me.Load
CheckForIllegalCrossThreadCalls = False
End Sub
Private Sub Button3_Click(sender As Object, e As EventArgs) Handles Button3.Click
Dim items As New List(Of String)
For Each i As String In ListBox1.Items
Dim isNew As Boolean = True
For Each it As String In items
If (it = i) Then isNew = False
Next
If (isNew) Then items.Add(i)
Next
ListBox1.Items.Clear()
For Each i As String In items
ListBox1.Items.Add(i)
Next
End Sub
End Class