Skip to content

Commit 3bccde1

Browse files
code scraper add
1 parent 9ff4d16 commit 3bccde1

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

twitter_post_scraper.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import re
4+
5+
# Noise pattern that tweeter_scrapper() substitutes with '' in every tweet.
# Alternatives, tried in this order at each position:
#   * a literal ':', '.' or '!'
#   * an optional 'https'/'http' followed by '://' and a run of URL characters
#     (word characters, '.', '/', '?', '=', '&', '%') ending at a word boundary
#   * any one character followed by 'twitter', any one character, 'com/' and
#     trailing word characters (the dots in this alternative are unescaped, so
#     they match any character, not just a literal '.')
#   * a literal '&'
# NOTE(review): because the single-character alternatives come first, a literal
# '.' is consumed on its own before the '.twitter.com/' alternative can start
# there; reordering or escaping would change what gets removed.
re_text = r'\:|\.|\!|(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b|(.twitter.com\/)\w*|\&'
6+
# Second-pass pattern: the literal text 'pictwittercom/' plus trailing word
# characters. tweeter_scrapper() applies it after re_text, which has already
# removed '.' characters -- presumably this targets what is left of
# 'pic.twitter.com/<id>' media links at that point (confirm against real data).
re_text_1 = r'(pictwittercom)\/\w*'
7+
8+
def tweeter_scrapper():
    """Scrape a Twitter profile page and print its tweets with noise removed.

    Reads a Twitter handle from standard input, downloads
    ``https://twitter.com/<handle>``, extracts the text of every
    ``div.tweet`` element and strips punctuation, links and filler
    characters from it using ``re_text`` and ``re_text_1``.

    Returns:
        The list of cleaned tweet strings (the same list that is printed).

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the request times out.
    """
    base_tweeter_url = 'https://twitter.com/{}'

    # Stray whitespace around the typed handle would end up inside the URL.
    tweeter_id = input().strip()

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(base_tweeter_url.format(tweeter_id), timeout=10)
    soup = BeautifulSoup(response.content, 'lxml')

    # NOTE(review): these class names look like Twitter's legacy
    # server-rendered markup -- confirm the page still serves it.
    clear_list_of_tweets = []
    for tweet in soup.find_all('div', {'class': 'tweet'}):
        content = tweet.find('div', {'class': 'content'})
        if content is None:
            # find() returns None when the child is missing; skip the tweet
            # instead of crashing with AttributeError.
            continue
        text_container = content.find('div', {'class': 'js-tweet-text-container'})
        if text_container is None:
            continue

        message = text_container.text.replace("\n", " ").strip()

        # Order matters: re_text removes the dots first, which turns
        # 'pic.twitter.com/...' into the form re_text_1 looks for.
        message = re.sub(re_text, '', message, flags=re.MULTILINE)
        message = re.sub(re_text_1, '', message, flags=re.MULTILINE)

        # Drop the "nbsp + ellipsis" truncation marker before any lone
        # non-breaking spaces, then zero-width non-joiners.
        message = message.replace('\xa0…', '')
        message = message.replace('\xa0', '')
        message = message.replace('\u200c', '')

        clear_list_of_tweets.append(message)

    print(clear_list_of_tweets)
    return clear_list_of_tweets
31+
32+
33+
34+
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    tweeter_scrapper()

0 commit comments

Comments
 (0)