Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: Updated Code to Extract Data from the Latest GeeksforGeeks Website Update #16

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 56 additions & 46 deletions modules/scrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,59 +7,68 @@ def __init__(self, username):
self.username = username

def fetchResponse(self):
BASE_URL = 'https://auth.geeksforgeeks.org/user/{}/practice/'.format(self.username)

def extract_text_from_elements(elements, element_keys):
    """Map each key in *element_keys* to the text of the matching element.

    Elements and keys are paired positionally; extra elements beyond the
    supplied keys are ignored (the original raised IndexError inside its
    own except handler in that case).  The site's placeholder text
    ``'_ _'`` and elements whose ``.text`` cannot be read are recorded as
    empty strings.

    Returns a dict mapping key -> scraped text (or '').
    """
    result = {}
    # zip() bounds the iteration by the shorter sequence, replacing the
    # manual index counter and its out-of-range failure mode.
    for key, element in zip(element_keys, elements):
        try:
            inner_text = element.text
        except AttributeError:
            # Element has no .text attribute — treat as missing data.
            inner_text = ""
        # '_ _' is the page's placeholder for an absent value.
        result[key] = "" if inner_text == "_ _" else inner_text
    return result
BASE_URL = 'https://www.geeksforgeeks.org/user/{}/'.format(self.username)

def extract_details(soup):
    """Scrape basic profile details and coding scores from the profile page.

    Returns a dict with two keys:
      * "basic_details"  -> {"institution", "languagesUsed", "campusAmbassador"}
      * "coding_scores"  -> {"codingScore", "totalProblemsSolved", "monthlyCodingScore"}
    Missing values are recorded as ''.
    """
    basic_details_by_index = ["institution", "languagesUsed", "campusAmbassador"]
    coding_scores_by_index = ["codingScore", "totalProblemsSolved", "monthlyCodingScore", "articlesPublished"]
    # NOTE(review): these two lookups (and the extract_text_from_elements
    # results built from them below) are dead — both response entries are
    # overwritten by the dict literals further down. Leftover from the old
    # auth.geeksforgeeks.org page layout; candidates for removal.
    basic_details = soup.find_all("div", class_ = "basic_details_data")
    coding_scores = soup.find_all("span", class_ = "score_card_value")

    # Selectors for the current www.geeksforgeeks.org profile markup.
    # The hashed suffixes (e.g. __tgi9I) are CSS-module class names and
    # will break on the next frontend redeploy — fragile by nature.
    institution = soup.find("div", class_ = "educationDetails_head_left--text__tgi9I")
    lang_used = soup.find("div", class_ = "educationDetails_head_right--text__lLOHI")
    campusAmbas = soup.find("div", class_ = "basicUserDetails_head_CA--text__IoHEU")

    score_card = soup.find_all("div", class_ = "scoreCard_head_left--score__oSi_x")

    response = {}
    # Dead stores — immediately overwritten below (see NOTE above).
    response["basic_details"] = extract_text_from_elements(basic_details, basic_details_by_index)
    response["coding_scores"] = extract_text_from_elements(coding_scores, coding_scores_by_index)
    response["basic_details"] = {
        basic_details_by_index[0]: institution.text if institution else '',
        basic_details_by_index[1]: lang_used.text if lang_used else '',
        basic_details_by_index[2]: campusAmbas.text if campusAmbas else ''
    }

    # NOTE(review): indexing score_card[0..2] assumes the page always has
    # at least three score cards — an IndexError is raised otherwise
    # (find_all returns truthy Tags, so the `if score_card[i]` guards never
    # actually fire).  The 4th key, "articlesPublished", is never filled.
    # "__" is the placeholder shown for a missing monthly score.
    response["coding_scores"] = {
        coding_scores_by_index[0]: score_card[0].text if score_card[0] else '',
        coding_scores_by_index[1]: score_card[1].text if score_card[1] else '',
        coding_scores_by_index[2]: score_card[2].text if score_card[2] and score_card[2].text != "__" else ''
    }

    return response

def extract_questions_by_difficulty(soup, difficulty):
try:
response = {}
questions = []
question_list_by_difficulty_tag = soup.find("div", id = difficulty.replace("#", "")).find_all("a")
response["count"] = len(question_list_by_difficulty_tag)

for question_tag in question_list_by_difficulty_tag:
question = {}
question["question"] = question_tag.text
question["questionUrl"] = question_tag["href"]
questions.append(question)

response["questions"] = questions
return response
except:
return { "count": 0, "questions": [] }

def extract_questions_solved_count(soup):
    """Build per-difficulty solved-question stats for the user.

    Counts come from the profile page's problem navbar; the question
    lists come from the practice-API submissions endpoint (uses
    self.username and the module-level `re`/`requests` imports from the
    enclosing scope — TODO confirm those imports exist at file top).

    Returns {difficulty: {"count": int,
                          "questions": [{"question": str, "questionUrl": str}]}}
    for school/basic/easy/medium/hard.
    """
    # NOTE(review): this first list is dead — immediately replaced by the
    # anchor-less list below. Leftover from the old '#anchor' page layout.
    difficulties = ["#school", "#basic", "#easy", "#medium", "#hard"]

    difficulties = ["school", "basic", "easy", "medium", "hard"]
    result = {}

    # Structure data
    for difficulty in difficulties:
        # NOTE(review): this call's result is overwritten by the default
        # entry on the next line, so it is a dead store — the old scraping
        # path kept alongside the new one; candidate for removal.
        result[difficulty] = extract_questions_by_difficulty(soup, difficulty)

        result[difficulty] = { "count": 0, "questions": []}

    # Navbar entries render like "EASY (42)"; parse category name + count.
    question_header = soup.find_all( "div", class_ = "problemNavbar_head_nav--text__UaGCx" )

    for el in question_header:
        match = re.search(r'([A-Za-z]+)\s*\(\s*(\d+)\s*\)', el.text)
        if match:
            cat_name = match.group(1).lower()
            # NOTE(review): a navbar category outside the five difficulty
            # keys initialised above would raise KeyError here.
            cat_count = int(match.group(2))
            result[cat_name]["count"] = cat_count


    # Fetch the solved problems themselves from the practice API; empty
    # requestType/year/month presumably returns all submissions — verify.
    response = requests.post("https://practiceapi.geeksforgeeks.org/api/v1/user/problems/submissions/", json={"handle":self.username,"requestType":"","year":"","month":""})
    submission_data = response.json()

    # API result is indexed by level then question id; `result` is keyed
    # by level.lower(), matching the difficulty keys set up above.
    for level in submission_data['result']:
        for ques in submission_data['result'][level]:
            url = "https://www.geeksforgeeks.org/problems/{}/0" .format(submission_data['result'][level][ques]['slug'])
            pname = submission_data['result'][level][ques]['pname']

            result[level.lower()]['questions'].append({"question": pname, "questionUrl": url})

    return result




profilePage = requests.get(BASE_URL)

if profilePage.status_code == 200:
Expand All @@ -70,17 +79,18 @@ def extract_questions_solved_count(soup):

generalInfo["userName"] = self.username

profile_pic = soup.find("img", class_ = "profile_pic")
institute_rank = soup.find("span", class_ = "rankNum")
streak_count = soup.find("div", class_ = "streakCnt")
profile_pic = soup.findAll("img", alt = self.username)[-1]
institute_rank = soup.find("span", class_ = "educationDetails_head_left_userRankContainer--text__wt81s")
streak_count = soup.find("div", class_ = "circularProgressBar_head_mid_streakCnt__MFOF1 tooltipped")


try:
generalInfo["profilePicture"] = profile_pic["src"]
generalInfo["profilePicture"] = "https://www.geeksforgeeks.org/" + profile_pic["src"]
except:
generalInfo["profilePicture"] = ""

try:
generalInfo["instituteRank"] = institute_rank.text
generalInfo["instituteRank"] = institute_rank.text.split(" ")[0]
except:
generalInfo["instituteRank"] = ""

Expand All @@ -101,7 +111,7 @@ def extract_questions_solved_count(soup):
generalInfo[_key] = _value

for key, value in question_count_details.items():
solvedStats[key.replace("#", "")] = value
solvedStats[key] = value

response["info"] = generalInfo
response["solvedStats"] = solvedStats
Expand Down