mathCrazyy
diff --git a/__pycache__/model.cpython-36.pyc b/__pycache__/model.cpython-36.pyc
-23 Bytes
diff --git a/__pycache__/train_eval.cpython-36.pyc b/__pycache__/train_eval.cpython-36.pyc
84 Bytes
diff --git a/__pycache__/transformer.cpython-36.pyc b/__pycache__/transformer.cpython-36.pyc
3.21 KB
diff --git a/__pycache__/utils.cpython-36.pyc b/__pycache__/utils.cpython-36.pyc
-23 Bytes
diff --git a/load_model.ipynb b/load_model.ipynb
Lines changed: 137 additions & 0 deletions
diff --git a/main_zh.py b/main_zh.py
Lines changed: 14 additions & 4 deletions
diff --git a/temp.py b/temp.py
Lines changed: 3 additions & 0 deletions
diff --git a/test.py b/test.py
Lines changed: 28 additions & 20 deletions
diff --git a/train_eval.py b/train_eval.py
Lines changed: 6 additions & 2 deletions
@@ -365,6 +365,143 @@
     "test_sentence(model, TEXT, sentence)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import codecs\n",
+    "path=\"E:/data/word_nlp/cnews_data/bert_embedding\"\n",
+    "\n",
+    "lines = codecs.open(path, encoding=\"utf-8\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "vocab={'<unk>': 0, '<pad>': 1, '，': 2, '的': 3, '。': 4, '是':5}\n",
+    "embedding_vec = [line.replace(\"\\n\", \"\") for line in lines if line.split(\" \")[0] in vocab]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(embedding_vec)\n",
+    "#embedding_vec\n",
+    "#embedding_vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "21131\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "lines = codecs.open(path, encoding=\"utf-8\")\n",
+    "embedding_vec = [line for line in lines ]\n",
+    "print(len(embedding_vec))\n",
+    "#embedding_vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "。\n",
+      "是\n",
+      "的\n",
+      "，\n"
+     ]
+    }
+   ],
+   "source": [
+    "for one in embedding_vec:\n",
+    "    if one.split(\" \")[0] in vocab:\n",
+    "        print(one.split(\" \")[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#lines = codecs.open(path, encoding=\"utf-8\")\n",
+    "#for line in list(lines)[0:10]:\n",
+    "#    print(line[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vocab[\"。\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "yes\n"
+     ]
+    }
+   ],
+   "source": [
+    "if \"。\" in vocab:\n",
+    "    print(\"yes\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
 
@@ -10,11 +10,13 @@
 from torch.autograd import Variable as V
 
 from model import RNNModel
+from transformer import TransformerModel
 
 class Config(object):
     def __init__(self):
         self.model_name="lm_model"
-        self.data_ori="/mnt/data3/wuchunsheng/data_all/data_mine/lm_data/"
+        #self.data_ori="/mnt/data3/wuchunsheng/data_all/data_mine/lm_data/"
+        self.data_ori="E:/data/word_nlp/cnews_data/"
         self.train_path="train_0.csv"
         self.valid_path="train_0.csv"
         self.test_path="test_100.csv"
@@ -32,12 +34,18 @@ def __init__(self):
         self.hidden_size=200
         self.nlayers=1
         self.dropout=0.5
-        self.epoch=2
+        self.epoch=1
 
         self.train_len=0
         self.test_len = 0
         self.valid_len = 0
         self.mode="train"
+
+        ## transformer的参数
+        self.dropout=0.5
+        self.max_len=5000
+        self.nhead=2
+
 #data_path="E:/study_series/2020_3/re_write_classify/data/"
 #data_path="/mnt/data3/wuchunsheng/code/nlper/NLP_task/text_classification/my_classification_cnews/2020_3_30/text_classify/data/"
 
@@ -47,8 +55,10 @@ def __init__(self):
 
 
 device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-model = RNNModel(config, TEXT).to(device)
+#model = RNNModel(config, TEXT).to(device)
+
+model=TransformerModel(config, TEXT).to(device)
 
-train(config,model,train_iter, valid_iter, test_iter)
+#train(config,model,train_iter, valid_iter, test_iter)
 
 test(config,model,TEXT,  test_iter)## 测试的是一个正批量的
@@ -0,0 +1,3 @@
+import torch
+import torch.nn as nn
+from torch.nn import TransformerEncoder, TransformerEncoderLayer
@@ -11,36 +11,41 @@
 from train_eval import test_sentence,load_model
 
 from model import RNNModel
-
+from transformer import TransformerModel
 
 class Config(object):
     def __init__(self):
-        self.model_name = "lm_model"
-        self.data_ori =  "/mnt/data3/wuchunsheng/data_all/data_mine/lm_data/"
-        self.train_path = "train_0.csv"
-        self.valid_path = "train_0.csv"
-        self.test_path = "test_100.csv"
-        self.sen_max_length = 150
-        # self.embedding_path = "need_bertembedding"
+        self.model_name="lm_model"
+        #self.data_ori="/mnt/data3/wuchunsheng/data_all/data_mine/lm_data/"
+        self.data_ori="E:/data/word_nlp/cnews_data/"
+        self.train_path="train_0.csv"
+        self.valid_path="train_0.csv"
+        self.test_path="test_100.csv"
+        self.sen_max_length=150
+        #self.embedding_path = "need_bertembedding"
         self.embedding_path = "bert_embedding"
-        self.embedding_dim = 768
-        self.vocab_maxsize = 4000
-        self.vocab_minfreq = 10
-        self.save_path = "lm_ckpt"
+        self.embedding_dim=768
+        self.vocab_maxsize=4000
+        self.vocab_minfreq=10
+        self.save_path="lm_ckpt"
 
         self.batch_size = 64
         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-        self.hidden_size = 200
-        self.nlayers = 1
-        self.dropout = 0.5
-        self.epoch = 2
+        self.hidden_size=200
+        self.nlayers=1
+        self.dropout=0.5
+        self.epoch=2
 
-        self.train_len = 0
+        self.train_len=0
         self.test_len = 0
         self.valid_len = 0
+        self.mode="test"
 
-        self.mode = "test"
+        ## transformer的参数
+        self.dropout=0.5
+        self.max_len=5000
+        self.nhead=2
 
 
 # data_path="E:/study_series/2020_3/re_write_classify/data/"
@@ -49,10 +54,13 @@ def __init__(self):
 config = Config()
 train_iter, valid_iter, test_iter, TEXT = generate_data(config)
 
-model = RNNModel(config, TEXT).to(config.device)
+#model = RNNModel(config, TEXT).to(config.device)
+model=TransformerModel(config, TEXT).to(config.device)
+
 model =load_model(config, model)
 
-sen="体育"
+sen="comment体育项目"
+sen="".join(['c', 'o', 'n', 't', 'e', 'x', 't', ',', 'l', 'a', 'b', 'e', 'l'])
 res=test_sentence(config, model ,TEXT, sen)
 print(res)
 
@@ -17,7 +17,8 @@ def train(config,model,train_iter, valid_iter,test_iter):
         for batch in train_iter:
             # reset the hidden state or else the model will try to backpropagate to the
             # beginning of the dataset, requiring lots of time and a lot of memory
-            model.reset_history()
+            if(model.mode_type!="transformer"):
+                model.reset_history()
 
             optimizer.zero_grad()
 
@@ -42,7 +43,8 @@ def train(config,model,train_iter, valid_iter,test_iter):
         model.eval()
         # model.train()
         for batch in valid_iter:
-            model.reset_history()
+            if(model.mode_type!="transformer"):
+                model.reset_history()
             text, targets = batch.text.to(config.device), batch.target.to(config.device)
             prediction = model(text).to(config.device)
             loss = criterion(prediction.view(-1, config.n_tokens), targets.view(-1)).to(config.device)
@@ -66,6 +68,7 @@ def test(config, model, TEXT, test_iter):
     print(len(inputs_word))
 
     arrs = model(b.text.cuda()).cuda().data.cpu().numpy()
+    print(arrs.shape)
     preds = word_ids_to_sentence(np.argmax(arrs, axis=2), TEXT.vocab)
 
     print(preds)
@@ -76,6 +79,7 @@ def test_sentence(config, model, TEXT, sentence):
     inputs = inputs.view(-1, 1)
     # print(inputs.shape)
     arrs = model(inputs)
+    print("arrs shape: ",arrs.shape)
     preds = word_ids_to_sentence(np.argmax(arrs.detach().cpu(), axis=2), TEXT.vocab)
 
     return preds
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -0,0 +1,3 @@` |
| | `1` | `+import torch` |
| | `2` | `+import torch.nn as nn` |
| | `3` | `+from torch.nn import TransformerEncoder, TransformerEncoderLayer` |