#52: 修复 start/end position 越界问题

cjopengler · cjopengler · commit 2ddd82a6ea92 · 2021-11-22T18:29:30.000+08:00
diff --git a/data/dataset/mrc_msra_ner/sample.json b/data/dataset/mrc_msra_ner/sample.json
@@ -1,25 +1,22 @@
 [
 
   {
-    "context": "因 有 关 日 寇 在 京 掠 夺 文 物 详 情 ， 藏 界 较 为 重 视 ， 也 是 我 们 收 藏 北 京 史 料 中 的 要 件 之 一 。",
+    "context": "根 据 选 举 新 闻 中 心 公 布 的 初 步 统 计 结 果 ， 至 夜 晚 ２ ２ 时 ３ ０ 分 投 票 结 束 ， 全 港 共 有 １ ４ ８ ９ ７ ０ ５ 地 区 直 选 选 民 投 票 ， 投 票 率 达 ５ ３ ． ２ ９ ％ ， 比 １ ９ ９ ５ 年 分 别 增 加 近 ５ ７ 万 人 和 １ ７ 个 多 百 分 点 ， 为 香 港 历 史 最 高 纪 录 ； 共 有 ７ ７ ８ １ ３ 功 能 界 别 选 举 选 民 投 票 ， 投 票 率 达 ６ ３ ． ５ ％ ； 选 举 委 员 会 选 举 投 票 选 民 达 到 ７ ９ ０ 人 ， 投 票 率 高 达 ９ ８ ． ７ ５ ％ 。",
     "end_position": [
-      3,
-      6,
-      28
+      7,
+      129
     ],
-    "entity_label": "NS",
+    "entity_label": "NT",
     "impossible": false,
-    "qas_id": "2.1",
-    "query": "按照地理位置划分的国家,城市,乡镇,大洲",
+    "qas_id": "13563.3",
+    "query": "组织包括公司,政府党派,学校,政府,新闻机构",
     "span_position": [
-      "3;3",
-      "6;6",
-      "27;28"
+      "2;7",
+      "125;129"
     ],
     "start_position": [
-      3,
-      6,
-      27
+      2,
+      125
     ]
   }
 ]
diff --git a/mrc/data/bert_model_collate.py b/mrc/data/bert_model_collate.py
@@ -54,6 +54,16 @@ def __call__(self, instances: List[Instance]) -> MRCModelInputs:
                                                          return_tensors="pt")
 
         batch_token_ids = batch_inputs["input_ids"]
+
+        batch_tokens = list()
+        for batch_token_id in batch_token_ids.tolist():
+            tmp_tokens = list()
+            batch_tokens.append(tmp_tokens)
+            for token_id in batch_token_id:
+                token = self._tokenizer.decode(token_id)
+                tmp_tokens.append(token)
+
+
         batch_token_type_ids = batch_inputs["token_type_ids"]
         batch_max_len = max(batch_inputs["length"])
 
@@ -98,6 +108,9 @@ def __call__(self, instances: List[Instance]) -> MRCModelInputs:
                 origin_offset2token_idx_start = {}
                 origin_offset2token_idx_end = {}
 
+                last_token_start = 0
+                last_token_end = 0
+
                 for token_idx in range(len(token_ids)):
                     # query 的需要过滤
                     if token_type_ids[token_idx] == 0:
@@ -112,14 +125,20 @@ def __call__(self, instances: List[Instance]) -> MRCModelInputs:
                     if token_start == token_end == 0:
                         continue
 
+                    # 保存下最后的 start 和 end
+                    last_token_start = token_start
+                    last_token_end = token_end
+
                     # token_start 对应的就是 context 中的实际位置，与 start_position 与 end_position 是对应的
                     # token_idx 是 query 和 context 拼接在一起后的 index，所以 这就是 start_position 映射后的位置
                     origin_offset2token_idx_start[token_start] = token_idx
                     origin_offset2token_idx_end[token_end] = token_idx
 
                 # 将原始数据中的  start_positions 映射到 拼接 query context 之后的位置
-                new_start_positions = [origin_offset2token_idx_start[start] for start in start_positions]
-                new_end_positions = [origin_offset2token_idx_end[end] for end in end_positions]
+                new_start_positions = [origin_offset2token_idx_start[start] for start in start_positions
+                                       if start <= last_token_start]
+                new_end_positions = [origin_offset2token_idx_end[end] for end in end_positions
+                                     if end <= last_token_end]
 
                 metadata["positions"] = zip(start_positions, end_positions)