|
1 | 1 | #!/usr/bin/env python3
|
2 | 2 | """Parse Leetcode/Lintcode html page to markdown."""
|
3 | 3 |
|
4 |
| -import sys |
5 |
| -from pyquery import PyQuery |
| 4 | +import frontmatter |
6 | 5 | import requests
|
7 | 6 | import html2text
|
8 | 7 |
|
9 | 8 |
|
10 |
| -class OJHtml2Markdown(object): |
11 |
| - """Parse Leetcode/Lintcode html page to markdown.""" |
12 |
| - |
13 |
| - def __init__(self, url, prefer_leetcode=False): |
14 |
| - """Init.""" |
15 |
| - self._prefer_leetcode = prefer_leetcode |
16 |
| - url = url.strip().rstrip('/').replace('/zh-cn/', '/en/') |
17 |
| - key_end = url.find('.com/') |
18 |
| - self._site = url[key_end - 8:key_end] |
19 |
| - self._url = url |
20 |
| - self._raw_p_html = PyQuery(url=url) |
21 |
| - self._p_url_path = url.split('/')[-1] |
22 |
| - self._p_urls = {} |
23 |
| - |
24 |
| - def _lint2leet(self): |
25 |
| - """Replace lintcode with leetcode if prefer leetcode.""" |
26 |
| - if self._url.startswith('https://leetcode.com/problems/'): |
27 |
| - return |
28 |
| - url = 'https://leetcode.com/problems/{}/'.format(self._p_url_path) |
class YamlContent(object):
    """Read-only pairing of front-matter metadata with body text.

    Instances expose ``metadata`` and ``content`` attributes; ``problem2md``
    builds one and hands it to ``frontmatter.dumps``.
    """

    def __init__(self, metadata, content):
        """Store the given values without copying or validating them.

        Args:
            metadata: Front-matter values (``problem2md`` passes a dict).
            content: Body text (``problem2md`` passes a Markdown string).
        """
        self.metadata_ = metadata
        self.content_ = content

    @property
    def metadata(self):
        """Return the metadata object supplied at construction."""
        return self.metadata_

    @property
    def content(self):
        """Return the body text supplied at construction."""
        return self.content_
| 24 | + |
| 25 | + |
def leet_lint_url(url):
    """Return the LeetCode/LintCode problem URLs that respond with HTTP 200.

    The problem slug is taken from the last path segment of ``url``. A
    candidate URL is built for each site from that slug and probed with an
    HTTP HEAD request.

    Args:
        url: Problem page URL; surrounding whitespace and leading/trailing
            slashes are ignored when extracting the slug.

    Returns:
        A dict mapping ``'leetcode'`` and/or ``'lintcode'`` to the candidate
        URL whose HEAD request returned status 200. Empty when neither did.

    Raises:
        Whatever ``requests.head`` raises (e.g. on connection failure); no
        exception is caught here.
    """
    # The slug is the last path *segment*. Indexing the stripped string
    # directly would yield only its final character.
    problem_slug = url.strip().strip('/').split('/')[-1]
    candidates = (
        ('leetcode',
         'https://leetcode.com/problems/{}/'.format(problem_slug)),
        ('lintcode',
         'http://www.lintcode.com/en/problem/{}/'.format(problem_slug)),
    )
    urls = {}
    for site, candidate in candidates:
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # this call indefinitely; consider adding one.
        response = requests.head(candidate)
        if response.status_code == 200:
            urls[site] = candidate
    return urls
| 39 | + |
| 40 | + |
def problem2md(problem):
    """Render a problem record as Markdown preceded by YAML front matter.

    Args:
        problem: Mapping read through the keys ``title``, ``difficulty``,
            ``tags``, ``description`` and ``url``. ``description`` is fed
            to html2text, so it is presumably an HTML fragment (TODO
            confirm against the caller). ``tags`` is joined with ``', '``,
            so it must be an iterable of strings when non-empty.

    Returns:
        The string produced by ``frontmatter.dumps``. The front matter
        holds the title, the difficulty and, when non-empty, the tags; the
        body is the generated Markdown.

    Note:
        Calls ``leet_lint_url``, which issues HTTP HEAD requests.
    """
    title = problem['title']
    difficulty = problem['difficulty']
    tags = problem['tags']

    front_matter = {'title': title, 'difficulty': difficulty}
    if tags:
        front_matter['tags'] = tags

    # Convert the problem statement before assembling the Markdown body.
    converter = html2text.HTML2Text()
    description_md = converter.handle(problem['description'])

    sections = ['# ' + title + '\n', '## Problem\n', '### Metadata\n']
    if tags:
        sections.append('- tags: ' + ', '.join(tags))
    sections.append('- difficulty: ' + difficulty)
    sections.extend(
        '- source({}): <{}>'.format(site, link)
        for site, link in leet_lint_url(problem['url']).items()
    )
    sections.append('\n### Description\n')
    sections.append(description_md)

    body = '\n'.join(sections)
    return frontmatter.dumps(YamlContent(front_matter, body),
                             allow_unicode=True)
0 commit comments