Skip to content

Commit ace5f4b

Browse files
committed
convert from html to markdown
1 parent 02cd10d commit ace5f4b

File tree

4 files changed

+80
-136
lines changed

4 files changed

+80
-136
lines changed

scripts/leetcode.py

+8-1
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,12 @@ def get_tags(self):
4747
tags.append(tag)
4848
return tags
4949

50+
def _clean_url(self, url):
51+
new_url = ['https:/', 'leetcode.com', 'problems']
52+
problem_slug = url[len('https://'):].strip('/').split('/')[2]
53+
new_url.append(problem_slug)
54+
return '/'.join(new_url)
55+
5056
def get_problem_all(self, url):
5157
"""获取所有细节"""
5258
print('get all the problem detail...')
@@ -59,7 +65,8 @@ def get_problem_all(self, url):
5965
'title': title,
6066
'difficulty': difficulty,
6167
'tags': tags,
62-
'description': description
68+
'description': description,
69+
'url': self._clean_url(url)
6370
}
6471
self.teardown()
6572
return problem

scripts/main.py

+8
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@
66
from datetime import datetime
77

88
from util import par_dir, mkdir_p
9+
from leetcode import Leetcode
10+
from ojhtml2markdown import problem2md
911

1012
BASEDIR = os.path.abspath(os.path.dirname(__file__))
1113

@@ -28,3 +30,9 @@ def curr_time():
2830
print('Called with arguments: {}'.format(args))
2931

3032
ROOTDIR = par_dir(BASEDIR)
33+
raw_url = args.new
34+
if raw_url.startswith('https://leetcode'):
35+
leetcode = Leetcode()
36+
problem = leetcode.get_problem_all(raw_url)
37+
problem_md = problem2md(problem)
38+
print(problem_md)

scripts/ojhtml2markdown.py

100755100644
+60-135
Original file line number | Diff line number | Diff line change
@@ -1,144 +1,69 @@
11
#!/usr/bin/env python3
22
"""Parse Leetcode/Lintcode html page to markdown."""
33

4-
import sys
5-
from pyquery import PyQuery
4+
import frontmatter
65
import requests
76
import html2text
87

98

10-
class OJHtml2Markdown(object):
11-
"""Parse Leetcode/Lintcode html page to markdown."""
12-
13-
def __init__(self, url, prefer_leetcode=False):
14-
"""Init."""
15-
self._prefer_leetcode = prefer_leetcode
16-
url = url.strip().rstrip('/').replace('/zh-cn/', '/en/')
17-
key_end = url.find('.com/')
18-
self._site = url[key_end - 8:key_end]
19-
self._url = url
20-
self._raw_p_html = PyQuery(url=url)
21-
self._p_url_path = url.split('/')[-1]
22-
self._p_urls = {}
23-
24-
def _lint2leet(self):
25-
"""Replace lintcode with leetcode if prefer leetcode."""
26-
if self._url.startswith('https://leetcode.com/problems/'):
27-
return
28-
url = 'https://leetcode.com/problems/{}/'.format(self._p_url_path)
9+
class YamlContent(object):
    """Pair a metadata mapping with a text body for front-matter output.

    ``problem2md`` builds an instance and passes it to
    ``frontmatter.dumps``.
    NOTE(review): presumably ``dumps`` only reads the ``metadata`` and
    ``content`` attributes of the object it is given -- confirm against
    the pinned python-frontmatter version.
    """

    def __init__(self, metadata, content):
        # Stored under trailing-underscore names; the read-only properties
        # below expose them as ``metadata`` / ``content``.
        self.metadata_ = metadata
        self.content_ = content

    @property
    def metadata(self):
        """Return the metadata object given at construction."""
        return self.metadata_

    @property
    def content(self):
        """Return the body text given at construction."""
        return self.content_

    def __repr__(self):
        return '{}(metadata={!r}, content={!r})'.format(
            type(self).__name__, self.metadata_, self.content_)
24+
25+
26+
def leet_lint_url(url):
    """Return the LeetCode/LintCode pages that exist for the problem in *url*.

    The problem slug is the last path segment of *url*.  A candidate page
    URL is built for each site and probed with an HTTP ``HEAD`` request;
    only candidates answering with status 200 are kept.

    Args:
        url: problem URL whose final path segment is the problem slug,
            e.g. ``https://leetcode.com/problems/two-sum``.

    Returns:
        dict mapping ``'leetcode'`` and/or ``'lintcode'`` to the candidate
        URL that responded with 200.  Empty when neither did.
    """
    # The slug is the last path *segment*.  Indexing the stripped string
    # directly would yield only its final character.
    problem_slug = url.strip().strip('/').split('/')[-1]
    candidates = (
        ('leetcode',
         'https://leetcode.com/problems/{}/'.format(problem_slug)),
        ('lintcode',
         'http://www.lintcode.com/en/problem/{}/'.format(problem_slug)),
    )
    urls = {}
    for site, candidate in candidates:
        try:
            # Without a timeout an unresponsive site would block forever.
            response = requests.head(candidate, timeout=10)
        except requests.RequestException:
            # An unreachable site simply is not listed as a source.
            continue
        if response.status_code == 200:
            urls[site] = candidate
    return urls
39+
40+
41+
def problem2md(problem):
    """Render a scraped problem as Markdown with YAML front matter.

    Args:
        problem: mapping with the keys ``'title'``, ``'difficulty'``,
            ``'tags'``, ``'description'`` (HTML) and ``'url'``.

    Returns:
        The string produced by ``frontmatter.dumps``: front matter holding
        title, difficulty and (when non-empty) tags, followed by a Markdown
        body with a metadata list and the converted description.
    """
    title = problem['title']
    difficulty = problem['difficulty']
    tags = problem['tags']

    # Front-matter fields; tags are only emitted when there are any.
    front = {'title': title, 'difficulty': difficulty}
    if tags:
        front['tags'] = tags

    # Convert the HTML problem statement to Markdown.
    converter = html2text.HTML2Text()
    body_md = converter.handle(problem['description'])

    parts = ['# ' + title + '\n', '## Problem\n', '### Metadata\n']
    if tags:
        parts.append('- tags: ' + ', '.join(tags))
    parts.append('- difficulty: ' + difficulty)
    # One bullet per site on which the problem page was found.
    parts.extend(
        '- source({}): <{}>'.format(site, link)
        for site, link in leet_lint_url(problem['url']).items()
    )
    parts.append('\n### Description\n')
    parts.append(body_md)

    post = YamlContent(front, '\n'.join(parts))
    return frontmatter.dumps(post, allow_unicode=True)

scripts/requirements.txt

+4
Original file line number | Diff line number | Diff line change
@@ -5,5 +5,9 @@ html2text==2018.1.9
55
idna==2.6
66
lxml==4.1.1
77
pyquery==1.4.0
8+
python-frontmatter==0.4.2
9+
PyYAML==3.12
810
requests==2.18.4
11+
selenium==3.10.0
12+
six==1.11.0
913
urllib3==1.22

0 commit comments

Comments
 (0)