|
11 | 11 | newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
|
12 | 12 | html_heading_re = re.compile(r'h[1-6]')
|
13 | 13 |
|
| 14 | +# extract (leading_nl, content, trailing_nl) from a string |
| 15 | +# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here) |
| 16 | +extract_newlines_re = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL) |
| 17 | + |
14 | 18 |
|
15 | 19 | # Heading styles
|
16 | 20 | ATX = 'atx'
|
@@ -168,6 +172,12 @@ def convert(self, html):
|
168 | 172 | def convert_soup(self, soup):
|
169 | 173 | return self.process_tag(soup, convert_as_inline=False)
|
170 | 174 |
|
| 175 | + def process_element(self, node, convert_as_inline): |
| 176 | + if isinstance(node, NavigableString): |
| 177 | + return self.process_text(node) |
| 178 | + else: |
| 179 | + return self.process_tag(node, convert_as_inline) |
| 180 | + |
171 | 181 | def process_tag(self, node, convert_as_inline):
|
172 | 182 | text = ''
|
173 | 183 |
|
@@ -203,23 +213,44 @@ def _can_ignore(el):
|
203 | 213 | return True
|
204 | 214 | else:
|
205 | 215 | return False
|
| 216 | + elif el is None: |
| 217 | + return True |
206 | 218 | else:
|
207 | 219 | raise ValueError('Unexpected element type: %s' % type(el))
|
208 | 220 |
|
209 |
| - children_to_convert = [child for child in node.children if not _can_ignore(child)] |
| 221 | + children_to_convert = [el for el in node.children if not _can_ignore(el)] |
210 | 222 |
|
211 |
| - # Convert the children first |
212 |
| - for el in children_to_convert: |
213 |
| - if isinstance(el, NavigableString): |
214 |
| - text += self.process_text(el) |
215 |
| - else: |
216 |
| - text_strip = text.rstrip('\n') |
217 |
| - newlines_left = len(text) - len(text_strip) |
218 |
| - next_text = self.process_tag(el, convert_children_as_inline) |
219 |
| - next_text_strip = next_text.lstrip('\n') |
220 |
| - newlines_right = len(next_text) - len(next_text_strip) |
221 |
| - newlines = '\n' * max(newlines_left, newlines_right) |
222 |
| - text = text_strip + newlines + next_text_strip |
| 223 | + # Convert the children elements into a list of result strings. |
| 224 | + child_strings = [self.process_element(el, convert_children_as_inline) for el in children_to_convert] |
| 225 | + |
| 226 | + # Remove empty string values. |
| 227 | + child_strings = [s for s in child_strings if s] |
| 228 | + |
| 229 | + # Collapse newlines at child element boundaries, if needed. |
| 230 | + if node.name == 'pre' or node.find_parent('pre'): |
| 231 | + # Inside <pre> blocks, do not collapse newlines. |
| 232 | + pass |
| 233 | + else: |
| 234 | + # Collapse newlines at child element boundaries. |
| 235 | + updated_child_strings = [''] # so the first lookback works |
| 236 | + for child_string in child_strings: |
| 237 | + # Separate the leading/trailing newlines from the content. |
| 238 | + leading_nl, content, trailing_nl = extract_newlines_re.match(child_string).groups() |
| 239 | + |
| 240 | + # If the last child had trailing newlines and this child has leading newlines, |
| 241 | + # use the larger newline count, limited to 2. |
| 242 | + if updated_child_strings[-1] and leading_nl: |
| 243 | + prev_trailing_nl = updated_child_strings.pop() # will be replaced by the collapsed value |
| 244 | + num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl))) |
| 245 | + leading_nl = '\n' * num_newlines |
| 246 | + |
| 247 | + # Add the results to the updated child string list. |
| 248 | + updated_child_strings.extend([leading_nl, content, trailing_nl]) |
| 249 | + |
| 250 | + child_strings = updated_child_strings |
| 251 | + |
| 252 | + # Join all child text strings into a single string. |
| 253 | + text = ''.join(child_strings) |
223 | 254 |
|
224 | 255 | # apply this tag's final conversion function
|
225 | 256 | convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
|
|
0 commit comments