|
1 | 1 | #!/usr/bin/env python3 |
2 | 2 | """Parse Leetcode/Lintcode html page to markdown.""" |
3 | 3 |
|
4 | | -import sys |
5 | | -from pyquery import PyQuery |
| 4 | +import frontmatter |
6 | 5 | import requests |
7 | 6 | import html2text |
8 | 7 |
|
9 | 8 |
|
10 | | -class OJHtml2Markdown(object): |
11 | | - """Parse Leetcode/Lintcode html page to markdown.""" |
12 | | - |
13 | | - def __init__(self, url, prefer_leetcode=False): |
14 | | - """Init.""" |
15 | | - self._prefer_leetcode = prefer_leetcode |
16 | | - url = url.strip().rstrip('/').replace('/zh-cn/', '/en/') |
17 | | - key_end = url.find('.com/') |
18 | | - self._site = url[key_end - 8:key_end] |
19 | | - self._url = url |
20 | | - self._raw_p_html = PyQuery(url=url) |
21 | | - self._p_url_path = url.split('/')[-1] |
22 | | - self._p_urls = {} |
23 | | - |
24 | | - def _lint2leet(self): |
25 | | - """Replace lintcode with leetcode if prefer leetcode.""" |
26 | | - if self._url.startswith('https://leetcode.com/problems/'): |
27 | | - return |
28 | | - url = 'https://leetcode.com/problems/{}/'.format(self._p_url_path) |
class YamlContent(object):
    """Read-only pair of front-matter metadata and body text.

    The object exposes ``metadata`` and ``content`` attributes, which is the
    shape ``problem2md`` hands to ``frontmatter.dumps``.
    """

    def __init__(self, metadata, content):
        """Store the metadata mapping and the body text.

        Args:
            metadata: Values to emit as front matter.
            content: Document body that follows the front matter.
        """
        self.metadata_ = metadata
        self.content_ = content

    @property
    def metadata(self):
        """Return the metadata given at construction time."""
        return self.metadata_

    @property
    def content(self):
        """Return the body text given at construction time."""
        return self.content_
| 24 | + |
| 25 | + |
def leet_lint_url(url):
    """Return the reachable LeetCode/LintCode pages for a problem URL.

    The problem slug is taken as the last path segment of ``url`` and
    substituted into each site's problem-URL template. Every candidate is
    probed with ``requests.head`` and kept only when the status code is 200.

    Args:
        url: Problem page URL; surrounding whitespace and slashes are ignored.

    Returns:
        dict mapping ``'leetcode'`` and/or ``'lintcode'`` to the candidate URL
        that answered 200. Empty when neither did or when ``url`` has no slug.
    """
    # Take the last path *segment*; indexing the stripped string directly
    # with [-1] would yield only its final character.
    problem_slug = url.strip().strip('/').split('/')[-1]
    if not problem_slug:
        # Nothing to look up, so skip the network probes entirely.
        return {}

    candidates = (
        ('leetcode', 'https://leetcode.com/problems/{}/'),
        ('lintcode', 'http://www.lintcode.com/en/problem/{}/'),
    )
    urls = {}
    for site, template in candidates:
        candidate = template.format(problem_slug)
        response = requests.head(candidate)
        if response.status_code == 200:
            urls[site] = candidate
    return urls
| 39 | + |
| 40 | + |
def problem2md(problem):
    """Render a problem record as markdown preceded by front matter.

    Args:
        problem: Mapping with the keys ``'title'``, ``'difficulty'``,
            ``'tags'``, ``'description'`` (HTML, converted with html2text)
            and ``'url'``.

    Returns:
        The result of ``frontmatter.dumps`` for the assembled document.
    """
    title = problem['title']
    difficulty = problem['difficulty']
    tags = problem['tags']

    # Front matter carries tags only when the problem actually has some.
    front_matter = {'title': title, 'difficulty': difficulty}
    if tags:
        front_matter['tags'] = tags

    converter = html2text.HTML2Text()
    body_md = converter.handle(problem['description'])

    sections = ['# ' + title + '\n', '## Problem\n', '### Metadata\n']
    if tags:
        sections.append('- tags: ' + ', '.join(tags))
    sections.append('- difficulty: ' + difficulty)
    sections.extend(
        '- source({}): <{}>'.format(site, link)
        for site, link in leet_lint_url(problem['url']).items()
    )
    sections.append('\n### Description\n')
    sections.append(body_md)

    document = YamlContent(front_matter, '\n'.join(sections))
    return frontmatter.dumps(document, allow_unicode=True)
0 commit comments