Visual Basic Twitter Feed Scraper
Submitted by Yorkiebar on Tuesday, September 10, 2013 - 07:18.
Language
Introduction:
Welcome to my tutorial on how to create a Twitter profile tweet scraper. First create a form which contains a textbox for the profile username and a button to begin the process.
Steps of Creation:
Step 1:
Add the following two Imports statements at the top of the form's code file so we can download the profile page source and manipulate it:
Step 2:
Now, add two functions; GetBetween and GetBetweenAll. We will be using these Regex functions to extract our tweets from our web page source.
Step 3:
On the button click event we are going to send a request to the profile page of the entered username in textbox1 and get the response (the source code once read):
Once we have read the source code of the page we are extracting all the loaded tweets using the GetBetweenAll function we already added. Then, as long as we have tweets, we are iterating through each one and writing the tweet to a text file in Current Directory > Profile Username > Tweet *TweetCount*.txt. Before we write the tweets we need to clean them of html tags...
Step 4:
OK, so now that we have our tweets, we need to clean them up so we aren't left with things like "&quot;" instead of a quotation mark ("). We are already running "msg" through our clearTags function, so let's create it:
Note: I might not have caught all the replacements, but these are the only ones I could see. If you see any more, just add more replacements to the above function.
Project Complete!
Below you will find the complete source code along with a download of the full project:
- Imports System.Net
- Imports System.Text.RegularExpressions
' Returns the text that sits between two delimiters inside Source.
'
' Source : text to search.
' Str1   : opening delimiter. It is passed to Regex.Split, so it is treated as a
'          regular-expression pattern; run literals containing metacharacters
'          through Regex.Escape before passing them in.
' Str2   : closing delimiter (also a regular-expression pattern).
' Index  : zero-based occurrence of Str1 to read from (default 0 = first).
'
' Returns the text following the chosen Str1 occurrence up to the next Str2
' (or up to the end of that segment when Str2 does not occur). Returns an empty
' string when Source is Nothing or when the requested occurrence of Str1 does
' not exist, instead of throwing.
Private Function GetBetween(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String, Optional ByVal Index As Integer = 0) As String
    If Source Is Nothing Then Return String.Empty

    ' pieces(0) is the text before the first Str1; pieces(k) follows the k-th match.
    Dim pieces As String() = Regex.Split(Source, Str1)

    ' Written without "Index + 1" so a huge Index cannot overflow.
    If Index < -1 OrElse Index >= pieces.Length - 1 Then Return String.Empty

    Return Regex.Split(pieces(Index + 1), Str2)(0)
End Function
' Collects every piece of text that appears between Str1 and Str2 in Source.
' Both delimiters are handed to Regex.Split and are therefore regex patterns.
' The result has one entry per match of Str1, in order of appearance; each
' entry runs from just after that match up to the next Str2 (or to the end of
' the segment when Str2 does not occur in it).
Private Function GetBetweenAll(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String) As String()
    Dim segments As String() = Regex.Split(Source, Str1)
    Dim found As New List(Of String)

    ' Segment 0 is whatever precedes the first Str1, so start at 1.
    For position As Integer = 1 To segments.Length - 1
        Dim head As String = Regex.Split(segments(position), Str2)(0)
        found.Add(head)
    Next

    Return found.ToArray()
End Function
' Click handler: downloads the Twitter profile page of the user named in
' TextBox1, extracts every tweet found in the page source and writes each one
' to <current directory>\<username>\Tweet <n>.txt (n starts at 0).
Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click
    Const ItemOpen As String = "<li class=""js-stream-item stream-item stream-item expanding-stream-item"" data-item-id="""
    Const ItemClose As String = "</div></div></li>"
    Const TextOpen As String = "<p class=""js-tweet-text tweet-text"">"
    Const TextClose As String = "</p>"

    ' The name ends up in both a URL and a directory path, so only accept the
    ' characters a Twitter handle can contain (also rules out "..", "/" etc.).
    Dim user As String = TextBox1.Text.Trim()
    If Not Regex.IsMatch(user, "^[A-Za-z0-9_]+$") Then
        MsgBox("Please enter a valid Twitter username.")
        Return
    End If

    Dim src As String = Nothing
    Try
        Dim request As HttpWebRequest = DirectCast(WebRequest.Create("http://www.twitter.com/" & user), HttpWebRequest)
        ' Using blocks release the connection and stream even if reading fails.
        Using response As HttpWebResponse = DirectCast(request.GetResponse(), HttpWebResponse)
            Using reader As New System.IO.StreamReader(response.GetResponseStream())
                src = reader.ReadToEnd()
            End Using
        End Using
    Catch ex As WebException
        ' Raised for network failures and for HTTP errors such as 404 (unknown user).
        MsgBox("Error. Could not download the profile page: " & ex.Message)
        Return
    End Try

    If String.IsNullOrEmpty(src) Then
        MsgBox("Error. Src is null")
        Return
    End If

    Dim tweets As String() = GetBetweenAll(src, ItemOpen, ItemClose)
    If tweets.Length = 0 Then Return

    ' CreateDirectory does nothing when the folder already exists.
    Dim folder As String = System.IO.Path.Combine(CurDir(), user)
    System.IO.Directory.CreateDirectory(folder)

    Dim tweetcount As Integer = 0
    For Each tweet As String In tweets
        ' Skip stream items without a tweet-text paragraph rather than asking
        ' GetBetween for a delimiter that is not there.
        If Not tweet.Contains(TextOpen) Then Continue For

        ' Extract and clean first, so no empty file is left behind on failure.
        Dim msg As String = CStr(clearTags(GetBetween(tweet, TextOpen, TextClose)))
        System.IO.File.WriteAllText(System.IO.Path.Combine(folder, "Tweet " & tweetcount & ".txt"), msg)
        tweetcount += 1
    Next
End Sub
' Turns a fragment of tweet HTML into plain text.
'
' s : HTML fragment (may be Nothing or empty).
'
' Returns the fragment with everything between "<" and ">" removed and HTML
' character entities (&quot;, &#39;, &amp;, ...) decoded. &nbsp; becomes an
' ordinary space. Returns an empty string for Nothing/empty input.
Private Function clearTags(ByVal s As String) As String
    If String.IsNullOrEmpty(s) Then Return String.Empty

    Dim cleaned As String = s

    ' Only strip when the text holds both bracket characters; a lone "<" or ">"
    ' is treated as ordinary text and left alone.
    If s.Contains("<") AndAlso s.Contains(">") Then
        Dim kept As New System.Text.StringBuilder(s.Length)
        Dim insideTag As Boolean = False
        For Each c As Char In s
            If c = "<"c Then
                insideTag = True
            ElseIf c = ">"c Then
                insideTag = False
            ElseIf Not insideTag Then
                kept.Append(c)
            End If
        Next
        cleaned = kept.ToString()
    End If

    ' Entities are decoded after tag stripping so that an escaped "&lt;" in the
    ' tweet text is not mistaken for the start of a tag.
    ' HtmlDecode would map &nbsp; to U+00A0, so convert it to a plain space first.
    cleaned = cleaned.Replace("&nbsp;", " ")
    Return System.Net.WebUtility.HtmlDecode(cleaned)
End Function
- Imports System.Net
- Imports System.Text.RegularExpressions
' Form with a username text box (TextBox1) and a button (Button1) that scrapes
' the tweets shown on a Twitter profile page and saves each one to a text file.
Public Class Form1

    ' Returns the text that sits between two delimiters inside Source.
    ' Str1 and Str2 are passed to Regex.Split and are therefore regular-expression
    ' patterns; run literals containing metacharacters through Regex.Escape.
    ' Index is the zero-based occurrence of Str1 to read from.
    ' Returns an empty string when Source is Nothing or the requested occurrence
    ' of Str1 does not exist, instead of throwing.
    Private Function GetBetween(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String, Optional ByVal Index As Integer = 0) As String
        If Source Is Nothing Then Return String.Empty

        ' pieces(0) is the text before the first Str1; pieces(k) follows the k-th match.
        Dim pieces As String() = Regex.Split(Source, Str1)

        ' Written without "Index + 1" so a huge Index cannot overflow.
        If Index < -1 OrElse Index >= pieces.Length - 1 Then Return String.Empty

        Return Regex.Split(pieces(Index + 1), Str2)(0)
    End Function

    ' Collects every piece of text that appears between Str1 and Str2 in Source
    ' (both are regex patterns). One entry per match of Str1, in order.
    Private Function GetBetweenAll(ByVal Source As String, ByVal Str1 As String, ByVal Str2 As String) As String()
        Dim Results, T As New List(Of String)
        T.AddRange(Regex.Split(Source, Str1))
        ' The first element is whatever precedes the first Str1.
        T.RemoveAt(0)
        For Each I As String In T
            Results.Add(Regex.Split(I, Str2)(0))
        Next
        Return Results.ToArray
    End Function

    ' Click handler: downloads the profile page of the user named in TextBox1,
    ' extracts every tweet in the page source and writes each one to
    ' <current directory>\<username>\Tweet <n>.txt (n starts at 0).
    Private Sub Button1_Click(sender As Object, e As EventArgs) Handles Button1.Click
        Const ItemOpen As String = "<li class=""js-stream-item stream-item stream-item expanding-stream-item"" data-item-id="""
        Const ItemClose As String = "</div></div></li>"
        Const TextOpen As String = "<p class=""js-tweet-text tweet-text"">"
        Const TextClose As String = "</p>"

        ' The name ends up in both a URL and a directory path, so only accept the
        ' characters a Twitter handle can contain (also rules out "..", "/" etc.).
        Dim user As String = TextBox1.Text.Trim()
        If Not Regex.IsMatch(user, "^[A-Za-z0-9_]+$") Then
            MsgBox("Please enter a valid Twitter username.")
            Return
        End If

        Dim src As String = Nothing
        Try
            Dim request As HttpWebRequest = DirectCast(WebRequest.Create("http://www.twitter.com/" & user), HttpWebRequest)
            ' Using blocks release the connection and stream even if reading fails.
            Using response As HttpWebResponse = DirectCast(request.GetResponse(), HttpWebResponse)
                Using reader As New System.IO.StreamReader(response.GetResponseStream())
                    src = reader.ReadToEnd()
                End Using
            End Using
        Catch ex As WebException
            ' Raised for network failures and for HTTP errors such as 404 (unknown user).
            MsgBox("Error. Could not download the profile page: " & ex.Message)
            Return
        End Try

        If String.IsNullOrEmpty(src) Then
            MsgBox("Error. Src is null")
            Return
        End If

        Dim tweets As String() = GetBetweenAll(src, ItemOpen, ItemClose)
        If tweets.Length = 0 Then Return

        ' CreateDirectory does nothing when the folder already exists.
        Dim folder As String = System.IO.Path.Combine(CurDir(), user)
        System.IO.Directory.CreateDirectory(folder)

        Dim tweetcount As Integer = 0
        For Each tweet As String In tweets
            ' Extract and clean first, so no empty file is left behind on failure.
            Dim msg As String = clearTags(GetBetween(tweet, TextOpen, TextClose))
            System.IO.File.WriteAllText(System.IO.Path.Combine(folder, "Tweet " & tweetcount & ".txt"), msg)
            tweetcount += 1
        Next
    End Sub

    ' Turns a fragment of tweet HTML into plain text: everything between "<" and
    ' ">" is removed and HTML character entities (&quot;, &#39;, &amp;, ...) are
    ' decoded. &nbsp; becomes an ordinary space. Nothing/empty input yields "".
    Private Function clearTags(ByVal s As String) As String
        If String.IsNullOrEmpty(s) Then Return String.Empty

        Dim cleaned As String = s

        ' Only strip when the text holds both bracket characters; a lone "<" or
        ' ">" is treated as ordinary text and left alone.
        If s.Contains("<") AndAlso s.Contains(">") Then
            Dim kept As New System.Text.StringBuilder(s.Length)
            Dim insideTag As Boolean = False
            For Each c As Char In s
                If c = "<"c Then
                    insideTag = True
                ElseIf c = ">"c Then
                    insideTag = False
                ElseIf Not insideTag Then
                    kept.Append(c)
                End If
            Next
            cleaned = kept.ToString()
        End If

        ' Entities are decoded after tag stripping so that an escaped "&lt;" in
        ' the tweet text is not mistaken for the start of a tag.
        ' HtmlDecode would map &nbsp; to U+00A0, so convert it to a plain space first.
        cleaned = cleaned.Replace("&nbsp;", " ")
        Return System.Net.WebUtility.HtmlDecode(cleaned)
    End Function

End Class
Note: Due to the size or complexity of this submission, the author has submitted it as a .zip file to shorten your download time. After downloading it, you will need a program like Winzip to decompress it.
Virus note: All files are scanned once-a-day by SourceCodester.com for viruses, but new viruses come out every day, so no prevention program can catch 100% of them.
FOR YOUR OWN SAFETY, PLEASE:
1. Re-scan downloaded files using your personal virus checker before using it.
2. NEVER, EVER run compiled files (.exe's, .ocx's, .dll's etc.)--only run source code.
Add new comment
- 130 views