
Commit 8d72d2f

Commit message: update
1 parent 39f869f · commit 8d72d2f

33 files changed, +41992 −13780 lines

Diff for: .gitignore (+1)

@@ -7,3 +7,4 @@ model_vars
 play_ground.py
 *.pkl
 tri_*
+test/*.txt

Diff for: README.md (+25 −1)

@@ -70,12 +70,36 @@ All token categories: `['PLAIN', 'PUNCT', 'DATE', 'LETTERS', 'CARDINAL', 'VERBA

## Competition log

-###2017-11-15
+### 2017-11-15

Day one of the competition; the new test data was released. Running the previous model as-is scored 0.9933 and jumped straight to third place. No idea how long that will hold.

Also ran a baseline with the big-dictionary method: 0.9893, kept as the reference point for later improvements.

### 2017-11-17

Error hunting goes in roughly two directions: 1. find the classes the classifier gets wrong; 2. find the cases the normalizer converts incorrectly.

Misclassified tokens can be located through the classifier's output confidence, for example by training xgboost with `'objective': 'multi:softprob'` and then inspecting the samples whose predicted probability is especially low.
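A minimal sketch of that inspection on toy data; in the competition this would be the token feature matrix and the encoded class labels, and none of the names below come from the repo:

```python
import numpy as np
import xgboost as xgb

# toy stand-in data; in the competition this would be the token feature
# matrix and the class labels ('PLAIN', 'DATE', ... encoded as integers)
rng = np.random.RandomState(0)
X = rng.rand(500, 10)
y = rng.randint(0, 4, 500)

params = {'objective': 'multi:softprob', 'num_class': 4, 'eta': 0.3, 'max_depth': 4}
bst = xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=20)

prob = bst.predict(xgb.DMatrix(X))        # shape (n_samples, num_class)
pred = prob.argmax(axis=1)
conf = prob.max(axis=1)

# rows with the smallest top probability are the first place to look for errors
for i in np.argsort(conf)[:20]:
    print(i, 'true:', y[i], 'pred:', pred[i], 'prob: %.3f' % conf[i])
```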
Another way to search the final output is to diff it against the big-dictionary baseline. This is not fully reliable either: a difference is not necessarily wrong and a match is not necessarily right, but some useful patterns do show up in the disagreements.
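A sketch of that comparison, assuming both files are submissions in the `"id","after"` format; `output/baseline.csv` matches the baseline script in this commit, while the model path is made up:

```python
import pandas as pd

model = pd.read_csv('output/model_submission.csv')   # hypothetical model output
base = pd.read_csv('output/baseline.csv')            # big-dictionary baseline

merged = model.merge(base, on='id', suffixes=('_model', '_base'))
diff = merged[merged['after_model'] != merged['after_base']]

# neither side is ground truth, but the disagreements are worth eyeballing
print(len(diff), 'tokens differ out of', len(merged))
print(diff.head(20))
```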
You can also scan the final output directly for digits and special symbols; those are usually cases where normalization failed, and this check catches them.
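A sketch of that scan (the submission path and the exact symbol set are assumptions):

```python
import pandas as pd

sub = pd.read_csv('output/model_submission.csv')   # hypothetical submission

# "after" values that still contain digits or special symbols usually mean
# the normalizer failed and the token was copied through unchanged
mask = sub['after'].astype(str).str.contains(r'[0-9#%$€£°]')
print(mask.sum(), 'suspicious rows')
print(sub[mask].head(20))
```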
Currently at 0.9947.
### 2017-11-20

Today is the last day of the competition. Found a problem with an earlier xgboost parameter setting, `'nthread': -1`; deleted it outright, since the default already uses the maximum number of threads. Sure enough, after rerunning, all 8 CPU cores run at full load and training is much faster.
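For reference, the corrected parameter dict would look roughly like this; everything except the objective and the dropped `nthread` key is an illustrative guess, not taken from the repo:

```python
params = {
    'objective': 'multi:softprob',   # per the 2017-11-17 entry
    'num_class': 16,                 # illustrative: one id per token class
    'eta': 0.3,
    'max_depth': 10,
    # no 'nthread' key: xgboost then defaults to using all available cores
}
```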
Also corrected the earlier context-based feature method. Previously every token's features included the previous and the next token, so the last token of a sentence picked up the first token of the next sentence as context, and the first token picked up the last token of the previous sentence. That is clearly wrong, so an all-zero vector is now inserted between sentences; the zero vector itself is ignored and only serves as context for its neighbours.
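A minimal sketch of that fixed context construction, assuming each sentence is already an array of per-token feature vectors (all names here are illustrative, not from the repo):

```python
import numpy as np

def build_context_features(sentences, dim):
    """sentences: list of (n_tokens, dim) arrays of per-token features.
    Returns one row per real token with [prev, current, next] features;
    an all-zero vector stands in for positions beyond a sentence boundary,
    so context never leaks between sentences."""
    zero = np.zeros(dim)
    rows = []
    for sent in sentences:
        padded = [zero] + [np.asarray(t, dtype=float) for t in sent] + [zero]
        for i in range(1, len(padded) - 1):
            rows.append(np.concatenate([padded[i - 1], padded[i], padded[i + 1]]))
    return np.vstack(rows)

# toy usage: two sentences with 3-dimensional token features
s1 = np.arange(6).reshape(2, 3)
s2 = np.arange(9).reshape(3, 3)
X = build_context_features([s1, s2], dim=3)
print(X.shape)   # (5, 9): 5 tokens, prev + current + next features
```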
On the last day, re-checked the rule functions for every class and found decent room for improvement.

Still worth improving (one possible rule tweak is sketched after this list):
- `ORDINAL`: whether a leading "the" should be produced.
- `VERBATIM`: whether `#` should be read as "hash-tag" or as "number".
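As an illustration of the kind of decision left open for `ORDINAL`, a hypothetical rule sketch (this is not the repo's rule function, and the condition for adding "the" is exactly what remains unresolved):

```python
import re
from num2words import num2words

def normalize_ordinal(token, prev_word=''):
    """Hypothetical ORDINAL rule: '21st' -> 'twenty-first', optionally
    prefixed with 'the'. Whether and when to add 'the' is the open question."""
    m = re.match(r'^(\d+)(st|nd|rd|th)$', token, re.IGNORECASE)
    if not m:
        return token
    words = num2words(int(m.group(1)), to='ordinal')
    if prev_word.lower() != 'the':
        words = 'the ' + words   # assumption: add 'the' only if not already present
    return words

print(normalize_ordinal('21st'))           # the twenty-first
print(normalize_ordinal('3rd', 'the'))     # third
```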
## Other information

### Third-party packages used

Diff for: baseline_class.py (+171, new file)

@@ -0,0 +1,171 @@
# coding=utf-8
# @author: cer
# use python3
from __future__ import print_function
import os
import operator
from num2words import num2words  # this package does not support Chinese
import gc
import pandas as pd
import numpy as np
import time
import pickle as pkl

train_file_name = "input/en_train.csv"
test_file = 'input/en_test_2.csv'
baseline_file = 'output/baseline_class.csv'
pkl_name = "output/class_dict.pkl"
train_df = pd.read_csv(train_file_name)
test_df = pd.read_csv(test_file)

# translation tables for normalising sub/superscript and other digit glyphs
SUB = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")
SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")
OTH = str.maketrans("፬", "4")


def train():
    print('Train start...')
    if os.path.exists(pkl_name):
        with open(pkl_name, "rb") as f:
            res = pkl.load(f)
    else:
        # Work with primary dataset
        train_file = open(train_file_name, encoding='UTF8')
        train_file.readline()
        res = dict()
        total = 0
        not_same = 0
        while 1:
            line = train_file.readline().strip()
            if line == '':
                break
            total += 1
            pos = line.find('","')
            text = line[pos + 2:]
            if text[:3] == '","':
                continue
            text = text[1:-1]
            arr = text.split('","')
            if arr[0] != arr[1]:
                not_same += 1
            if arr[0] not in res:
                res[arr[0]] = dict()
                res[arr[0]][arr[1]] = 1
            else:
                if arr[1] in res[arr[0]]:
                    res[arr[0]][arr[1]] += 1
                else:
                    res[arr[0]][arr[1]] = 1
        train_file.close()
        print(train_file_name + ':\tTotal: {} Have diff value: {}'.format(total, not_same))

        # Work with additional dataset from https://www.kaggle.com/google-nlu/text-normalization
        files = ['output_1.csv', 'output_6.csv', 'output_11.csv', 'output_16.csv',
                 'output_21.csv', 'output_91.csv', 'output_96.csv']

        for add_file_name in files:
            train_file = open(os.path.join("input", 'tn', add_file_name), encoding='UTF8')
            train_file.readline()
            while 1:
                line = train_file.readline().strip()
                if line == '':
                    break
                line = line.replace(',NA,', ',"NA",')
                total += 1
                pos = line.find('","')
                text = line[pos + 2:]
                if text[:3] == '","':
                    continue
                text = text[1:-1]
                arr = text.split('","')
                if arr[0] == '<eos>':
                    continue
                if arr[1] != '<self>':
                    not_same += 1

                if arr[1] == '<self>' or arr[1] == 'sil':
                    arr[1] = arr[0]

                if arr[0] not in res:
                    res[arr[0]] = dict()
                    res[arr[0]][arr[1]] = 1
                else:
                    if arr[1] in res[arr[0]]:
                        res[arr[0]][arr[1]] += 1
                    else:
                        res[arr[0]][arr[1]] = 1
            train_file.close()
            print(add_file_name + ':\tTotal: {} Have diff value: {}'.format(total, not_same))

        # cache the before->after counts so later runs can load them directly
        with open(pkl_name, "wb") as f:
            pkl.dump(res, f)

    return res


def solve(res):
    # hand-written expansions for a few common abbreviations
    sdict = {}
    sdict['km2'] = 'square kilometers'
    sdict['km'] = 'kilometers'
    sdict['kg'] = 'kilograms'
    sdict['lb'] = 'pounds'
    sdict['dr'] = 'doctor'
    sdict['m²'] = 'square meters'

    total = 0
    changes = 0
    out = open(baseline_file, "w", encoding='UTF8')
    out.write('"id","after"\n')
    test = open(test_file, encoding='UTF8')
    test.readline().strip()
    while 1:
        line = test.readline().strip()
        if line == '':
            break

        pos = line.find(',')
        i1 = line[:pos]
        line = line[pos + 1:]

        pos = line.find(',')
        i2 = line[:pos]
        line = line[pos + 1:]

        line = line[1:-1]
        out.write('"' + i1 + '_' + i2 + '",')
        if line in res:
            # pick the most frequent "after" form seen in training
            srtd = sorted(res[line].items(), key=operator.itemgetter(1), reverse=True)
            out.write('"' + srtd[0][0] + '"')
            changes += 1
        else:
            # join digit groups such as "1,000" before spelling them out
            if len(line) > 1:
                val = line.split(',')
                if len(val) == 2 and val[0].isdigit() and val[1].isdigit():
                    line = ''.join(val)

            if line.isdigit():
                srtd = line.translate(SUB)
                srtd = srtd.translate(SUP)
                srtd = srtd.translate(OTH)
                out.write('"' + num2words(float(srtd)) + '"')
                changes += 1
            elif len(line.split(' ')) > 1:
                val = line.split(' ')
                for i, v in enumerate(val):
                    if v.isdigit():
                        srtd = v.translate(SUB)
                        srtd = srtd.translate(SUP)
                        srtd = srtd.translate(OTH)
                        val[i] = num2words(float(srtd))
                    elif v in sdict:
                        val[i] = sdict[v]

                out.write('"' + ' '.join(val) + '"')
                changes += 1
            else:
                out.write('"' + line + '"')

        out.write('\n')
        total += 1

    print('Total: {} Changed: {}'.format(total, changes))
    test.close()
    out.close()


if __name__ == '__main__':
    res = train()
    solve(res)

Diff for: compare_with_big_data.py (+164, new file)

@@ -0,0 +1,164 @@
# coding=utf-8
# @author: cer
# use python3
from __future__ import print_function
import os
import operator
from num2words import num2words  # this package does not support Chinese
import gc
import pandas as pd
import numpy as np
import time

train_file_name = "input/en_train.csv"
test_file = 'input/en_test_2.csv'
baseline_file = 'output/baseline.csv'
train_df = pd.read_csv(train_file_name)
test_df = pd.read_csv(test_file)

# translation tables for normalising sub/superscript and other digit glyphs
SUB = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")
SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")
OTH = str.maketrans("፬", "4")

# hand-written expansions for a few common abbreviations (as in baseline_class.py)
sdict = {
    'km2': 'square kilometers',
    'km': 'kilometers',
    'kg': 'kilograms',
    'lb': 'pounds',
    'dr': 'doctor',
    'm²': 'square meters',
}


def train():
    print('Train start...')

    # Work with primary dataset
    train_file = open(train_file_name, encoding='UTF8')
    train_file.readline()
    res = dict()
    total = 0
    not_same = 0
    while 1:
        line = train_file.readline().strip()
        if line == '':
            break
        total += 1
        pos = line.find('","')
        text = line[pos + 2:]
        if text[:3] == '","':
            continue
        text = text[1:-1]
        arr = text.split('","')
        if arr[0] != arr[1]:
            not_same += 1
        if arr[0] not in res:
            res[arr[0]] = dict()
            res[arr[0]][arr[1]] = 1
        else:
            if arr[1] in res[arr[0]]:
                res[arr[0]][arr[1]] += 1
            else:
                res[arr[0]][arr[1]] = 1
    train_file.close()
    print(train_file_name + ':\tTotal: {} Have diff value: {}'.format(total, not_same))

    # Work with additional dataset from https://www.kaggle.com/google-nlu/text-normalization
    files = ['output_1.csv', 'output_6.csv', 'output_11.csv', 'output_16.csv',
             'output_21.csv', 'output_91.csv', 'output_96.csv']

    for add_file_name in files:
        train_file = open(os.path.join("input", 'tn', add_file_name), encoding='UTF8')
        train_file.readline()
        while 1:
            line = train_file.readline().strip()
            if line == '':
                break
            line = line.replace(',NA,', ',"NA",')
            total += 1
            pos = line.find('","')
            text = line[pos + 2:]
            if text[:3] == '","':
                continue
            text = text[1:-1]
            arr = text.split('","')
            if arr[0] == '<eos>':
                continue
            if arr[1] != '<self>':
                not_same += 1

            if arr[1] == '<self>' or arr[1] == 'sil':
                arr[1] = arr[0]

            if arr[0] not in res:
                res[arr[0]] = dict()
                res[arr[0]][arr[1]] = 1
            else:
                if arr[1] in res[arr[0]]:
                    res[arr[0]][arr[1]] += 1
                else:
                    res[arr[0]][arr[1]] = 1
        train_file.close()
        print(add_file_name + ':\tTotal: {} Have diff value: {}'.format(total, not_same))

    return res


def solve(res):
    total = 0
    changes = 0
    out = open(baseline_file, "w", encoding='UTF8')
    out.write('"id","after"\n')
    test = open(test_file, encoding='UTF8')
    test.readline().strip()
    while 1:
        line = test.readline().strip()
        if line == '':
            break

        pos = line.find(',')
        i1 = line[:pos]
        line = line[pos + 1:]

        pos = line.find(',')
        i2 = line[:pos]
        line = line[pos + 1:]

        line = line[1:-1]
        out.write('"' + i1 + '_' + i2 + '",')
        if line in res:
            # pick the most frequent "after" form seen in training
            srtd = sorted(res[line].items(), key=operator.itemgetter(1), reverse=True)
            out.write('"' + srtd[0][0] + '"')
            changes += 1
        else:
            # join digit groups such as "1,000" before spelling them out
            if len(line) > 1:
                val = line.split(',')
                if len(val) == 2 and val[0].isdigit() and val[1].isdigit():
                    line = ''.join(val)

            if line.isdigit():
                srtd = line.translate(SUB)
                srtd = srtd.translate(SUP)
                srtd = srtd.translate(OTH)
                out.write('"' + num2words(float(srtd)) + '"')
                changes += 1
            elif len(line.split(' ')) > 1:
                val = line.split(' ')
                for i, v in enumerate(val):
                    if v.isdigit():
                        srtd = v.translate(SUB)
                        srtd = srtd.translate(SUP)
                        srtd = srtd.translate(OTH)
                        val[i] = num2words(float(srtd))
                    elif v in sdict:
                        val[i] = sdict[v]

                out.write('"' + ' '.join(val) + '"')
                changes += 1
            else:
                out.write('"' + line + '"')

        out.write('\n')
        total += 1

    print('Total: {} Changed: {}'.format(total, changes))
    test.close()
    out.close()


if __name__ == '__main__':
    res = train()
    solve(res)
