Skip to content

Commit f8d3115

Browse files
committed
add text format segmentation
1 parent 31eded6 commit f8d3115

File tree

4 files changed

+24
-6
lines changed

4 files changed

+24
-6
lines changed

EduNLP/SIF/parser/parser.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ def _is_formula_legal(self, formula_str):
7676
7777
"""
7878
legal_tags = ['FormFigureID', 'FormFigureBase64', 'FigureID', 'FigureBase64',
79-
'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline']
79+
'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline', 'textf']
8080
for tag in legal_tags:
8181
if tag in formula_str:
8282
return True

EduNLP/SIF/segment/__init__.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -2,4 +2,4 @@
22
# 2021/5/18 @ tongshiwei
33

44
from .segment import (SegmentList, TextSegment, FigureFormulaSegment, LatexFormulaSegment, FigureSegment,
5-
QuesMarkSegment, Figure, TagSegment, SepSegment, seg)
5+
QuesMarkSegment, Figure, TagSegment, SepSegment, seg, TextFSegment)

EduNLP/SIF/segment/segment.py

+17 -3
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,10 @@ class SepSegment(str):
7575
pass
7676

7777

78+
class TextFSegment(str):
79+
pass
80+
81+
7882
class SegmentList(object):
7983
def __init__(self, item, figures: dict = None):
8084
self._segments = []
@@ -104,6 +108,9 @@ def __init__(self, item, figures: dict = None):
104108
self.append(TagSegment(segment[1:-1]))
105109
elif re.match(r"\$\\SIFSep\$", segment):
106110
self.append(SepSegment(segment[1:-1]))
111+
elif re.match(r"\$\\textf\{[^,]+?,b?d?i?t?u?w?}\$", segment):
112+
seg_capture = re.match(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", segment)
113+
self.append(TextFSegment(seg_capture.group(1)))
107114
else:
108115
self.append(LatexFormulaSegment(segment[1:-1]))
109116
self._seg_idx = None
@@ -115,8 +122,12 @@ def __len__(self):
115122
return len(self._segments)
116123

117124
def append(self, segment) -> None:
118-
if isinstance(segment, TextSegment):
119-
self._text_segments.append(len(self))
125+
if isinstance(segment, TextSegment) or isinstance(segment, TextFSegment):
126+
if len(self._text_segments) != 0 and self._text_segments[-1] == len(self) - 1:
127+
self._segments[-1] = self._segments[-1] + segment
128+
else:
129+
self._text_segments.append(len(self))
130+
self._segments.append(segment)
120131
elif isinstance(segment, (LatexFormulaSegment, FigureFormulaSegment)):
121132
self._formula_segments.append(len(self))
122133
elif isinstance(segment, FigureSegment):
@@ -129,7 +140,10 @@ def append(self, segment) -> None:
129140
self._sep_segments.append(len(self))
130141
else:
131142
raise TypeError("Unknown Segment Type: %s" % type(segment))
132-
self._segments.append(segment)
143+
if isinstance(segment, TextFSegment) or isinstance(segment, TextSegment):
144+
pass
145+
else:
146+
self._segments.append(segment)
133147

134148
@property
135149
def segments(self):

tests/test_sif/test_segement.py

+5 -1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@ def test_segment(figure0, figure1, figure0_base64, figure1_base64):
1919
r"如图所示,则$\FormFigureBase64{%s}$的面积是$\SIFBlank$。$\FigureBase64{%s}$" % (figure0_base64, figure1_base64),
2020
figures=True
2121
)
22-
2322
with pytest.raises(TypeError):
2423
s.append("123")
24+
seg_test_text = seg(
25+
r"如图所示,有三组$\textf{机器人,bu}$在踢$\textf{足球,b}$",
26+
figures=True
27+
)
28+
assert seg_test_text.text_segments == ['如图所示,有三组机器人在踢足球']

0 commit comments

Comments
 (0)