1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import re
4
+
5
# First cleaning pass: strips leading punctuation (':', '.', '!'), full
# http(s):// URLs, '*.twitter.com/<slug>' link fragments, and stray '&'.
# NOTE(review): the dots in '.twitter.com' are unescaped, so they match
# any character — presumably intentional to catch 'pic.twitter.com' etc.;
# verify before tightening.
re_text = r'\:|\.|\!|(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b|(.twitter.com\/)\w*|\&'
# Second pass: removes 'pictwittercom/<id>' fragments — pic.twitter.com
# links whose dots were already deleted by the first pass.
re_text_1 = r'(pictwittercom)\/\w*'
7
+
8
def tweeter_scrapper():
    """Scrape the visible tweets from a Twitter profile page and print them.

    Reads a Twitter handle from stdin, fetches ``https://twitter.com/<handle>``,
    extracts each tweet's text from the server-rendered markup, strips URL and
    punctuation noise with the module-level regexes, and prints the cleaned list.

    NOTE(review): the selectors ('div.tweet', 'div.js-tweet-text-container')
    target the legacy server-rendered twitter.com layout; the modern site is a
    JS app, so find_all may return nothing — confirm against a live page.

    Raises:
        requests.HTTPError: if the profile page request fails.
    """
    base_tweeter_url = 'https://twitter.com/{}'
    tweeter_id = input()

    # timeout= prevents the script from hanging forever on a dead/slow host;
    # raise_for_status() fails loudly instead of silently parsing an error page.
    response = requests.get(base_tweeter_url.format(tweeter_id), timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    all_tweets = soup.find_all('div', {'class': 'tweet'})

    list_of_dirty_tweets = [_extract_message(tweet) for tweet in all_tweets]
    clear_list_of_tweets = [_clean_tweet(t) for t in list_of_dirty_tweets]
    print(clear_list_of_tweets)


def _extract_message(tweet):
    """Pull the raw text out of one 'div.tweet' element, newlines flattened."""
    content = tweet.find('div', {'class': 'content'})
    text = content.find('div', {'class': 'js-tweet-text-container'}).text
    return text.replace("\n", " ").strip()


def _clean_tweet(dirty_tweet):
    """Strip URLs, punctuation noise, and invisible characters from tweet text."""
    dirty_tweet = re.sub(re_text, '', dirty_tweet, flags=re.MULTILINE)
    dirty_tweet = re.sub(re_text_1, '', dirty_tweet, flags=re.MULTILINE)
    # NBSP + ellipsis is the "truncated tweet" marker; also drop bare NBSP
    # and zero-width non-joiner characters left over from the markup.
    dirty_tweet = dirty_tweet.replace(u'\xa0…', u'')
    dirty_tweet = dirty_tweet.replace(u'\xa0', u'')
    return dirty_tweet.replace(u'\u200c', u'')
31
+
32
+
33
+
34
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    tweeter_scrapper()
0 commit comments