This repository was archived by the owner on Dec 19, 2018. It is now read-only.

Commit 181f0a8

Merge original repo's PR: Kyubyong#13
1 parent 5604ff8 commit 181f0a8

File tree

5 files changed: +63 lines, -25 lines

README.md
build_corpus.py
make_wordvectors.py
make_wordvectors.sh
requirements.txt


README.md

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ This project has two purposes. First of all, I'd like to share some of my experi
 
 ## Work Flow
 
-* STEP 1. Download the [wikipedia database backup dumps](https://dumps.wikimedia.org/backup-index.html) of the language you want (for example, for english wiki go to `https://dumps.wikimedia.org/enwiki/` click the latest timestamp, and download the `enwiki-YYYYMMDD-pages-articles-multistream.xml.bz2` file).
+* STEP 1-1. Download the [wikipedia database backup dumps](https://dumps.wikimedia.org/backup-index.html) of the language you want (for example, for english wiki go to `https://dumps.wikimedia.org/enwiki/` click the latest timestamp, and download the `enwiki-YYYYMMDD-pages-articles-multistream.xml.bz2` file).
+* STEP 1-2. Install requirements packages
 * STEP 2. Extract running texts to `data/` folder.
 * STEP 3. Run `build_corpus.py`.
 * STEP 4-1. Run `make_wordvector.sh` to get Word2Vec word vectors.

build_corpus.py

Lines changed: 12 additions & 10 deletions
@@ -1,5 +1,4 @@
 # coding: utf-8
-#!/usr/bin/python2
 import argparse
 import codecs
 import lxml.etree as ET
@@ -10,37 +9,40 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('--lcode', help='ISO 639-1 code of target language. See `lcodes.txt`.')
 parser.add_argument('--max_corpus_size', type=int, default=1000000000, help='the maximum size of the corpus. Feel free to adjust it according to your computing power.')
+parser.add_argument('--wiki_dump_version', help='version of wikimedia dumps')
+
 args = parser.parse_args()
 
 lcode = args.lcode
+wiki_dump_version = args.wiki_dump_version
 if lcode == 'ko':
     from konlpy.tag import Kkma # pip install konlpy. See http://konlpy.org/en/v0.4.4/ for further information.
     kkma = Kkma()
-    print "kkma succesfuly loaded!"
+    print("kkma succesfuly loaded!")
 elif lcode == 'ja':
     import MeCab # See https://pypi.python.org/pypi/mecab-python/0.996
     mecab = MeCab.Tagger("-Owakati")
-    print "mecab succesfuly loaded!"
+    print("mecab succesfuly loaded!")
 elif lcode == 'zh':
     import jieba # See https://pypi.python.org/pypi/jieba/
-    print "jieba succesfuly loaded!"
+    print("jieba succesfuly loaded!")
 elif lcode == 'vi':
     from pyvi.pyvi import ViTokenizer # See https://pypi.python.org/pypi/pyvi
-    print "pyvi succesfuly loaded!"
+    print("pyvi succesfuly loaded!")
 elif lcode == 'th':
     import pythai # See https://pypi.python.org/pypi/pythai
-    print "pythai succesfuly loaded!"
+    print("pythai succesfuly loaded!")
 # elif lcode == 'ar':
 #     os.environ['CLASSPATH'] = "../stanford-segmenter-2015-12-09"
 #     from nltk.tokenize.stanford_segmenter import StanfordSegmenter
 #     segmenter = StanfordSegmenter(path_to_jar="../stanford-segmenter-2015-12-09/stanford-segmenter-3.6.0.jar",
 #                                   path_to_sihan_corpora_dict="../stanford-segmenter-2015-12-09/data",
 #                                   path_to_model="../stanford-segmenter-2015-12-09/data/pku.gz",
 #                                   path_to_dict="../stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz")
-#     print "StanfordSegmenter succesfuly loaded!"
+#     print ("StanfordSegmenter succesfuly loaded!")
 
 max_corpus_size = args.max_corpus_size
-fname = "{}wiki-20161201-pages-articles-multistream.xml".format(lcode)
+fname = "{}wiki-{}-pages-articles-multistream.xml".format(lcode, wiki_dump_version)
 
 def clean_text(text):
     global lcode
@@ -157,7 +159,7 @@ def build_corpus():
                continue # it's okay as we have a pretty big corpus!
            elem.clear() # We need to save memory!
            if i % 1000 == 0:
-               print i,
+               print(i),
               fsize = os.path.getsize("data/{}.txt".format(lcode))
              if fsize > max_corpus_size:
                  break
@@ -166,4 +168,4 @@ def build_corpus():
 if __name__ == "__main__":
     build_corpus()
     
-    print "Done"
+    print("Done")

make_wordvectors.py

Lines changed: 10 additions & 7 deletions
@@ -1,10 +1,9 @@
 # coding: utf-8
-#!/usr/bin/python2
 import nltk
-import os
 import codecs
 import argparse
 import numpy as np
+import sys
 
 # arguments setting
 parser = argparse.ArgumentParser()
@@ -40,9 +39,13 @@ def get_min_count(sents):
 def make_wordvectors():
     global lcode
     import gensim # In case you have difficulties installing gensim, you need to consider installing conda.
-    import cPickle as pickle
+
+    if sys.version_info[0] >= 3:
+        import pickle
+    else:
+        import cPickle as pickle
 
-    print "Making sentences as list..."
+    print("Making sentences as list...")
     sents = []
     with codecs.open('data/{}.txt'.format(lcode), 'r', 'utf-8') as fin:
         while 1:
@@ -52,7 +55,7 @@ def make_wordvectors():
             words = line.split()
             sents.append(words)
     
-    print "Making word vectors..."
+    print("Making word vectors...")
     min_count = get_min_count(sents)
     model = gensim.models.Word2Vec(sents, size=vector_size, min_count=min_count,
                                    negative=num_negative,
@@ -62,11 +65,11 @@ def make_wordvectors():
     
     # Save to file
     with codecs.open('data/{}.tsv'.format(lcode), 'w', 'utf-8') as fout:
-        for i, word in enumerate(model.index2word):
+        for i, word in enumerate(model.wv.index2word):
             fout.write(u"{}\t{}\t{}\n".format(str(i), word.encode('utf8').decode('utf8'),
                                               np.array_str(model[word])
                                               ))
 if __name__ == "__main__":
     make_wordvectors()
     
-    print "Done"
+    print("Done")

make_wordvectors.sh

Lines changed: 11 additions & 7 deletions
@@ -2,27 +2,31 @@
 
 #### Set your hyper-parameters here ####
 ############## START ###################
-lcode="xx" # ISO 639-1 code of target language. See `lcodes.txt`.
-max_corpus_size=1000000000 # the maximum size of the corpus. Feel free to adjust it according to your computing power.
+lcode="id" # ISO 639-1 code of target language. See `lcodes.txt`.
+wiki_dump_version="20180120" # version of wikimedia dumps
+max_corpus_size=10000000000 # the maximum size of the corpus. Feel free to adjust it according to your computing power.
 vector_size=300 # the size of a word vector
 window_size=5 # the maximum distance between the current and predicted word within a sentence.
 vocab_size=20000 # the maximum vocabulary size
 num_negative=5 # the int for negative specifies how many “noise words” should be drawn
 ############## END #####################
 
-echo "step 0. Make `data` directory and move there.`
+echo "step 0. Install packages according to requirements.txt"
+pip install -r requirements.txt
+
+echo "step 0. Make `data` directory and move there."
 mkdir data; cd data
 
 echo "step 1. Download the stored wikipedia file to your disk."
-wget "https://dumps.wikimedia.org/${lcode}wiki/20161201/${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
+rm -rf ${lcode}wiki-${wiki_dump_version}-pages-articles-multistream*
+wget "https://dumps.wikimedia.org/${lcode}wiki/${wiki_dump_version}/${lcode}wiki-${wiki_dump_version}-pages-articles-multistream.xml.bz2"
 
 echo "step 2. Extract the bz2 file."
-bzip2 -d "${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
+bzip2 -d "${lcode}wiki-${wiki_dump_version}-pages-articles-multistream.xml.bz2"
 
 cd ..
 echo "step 3. Build Corpus."
-python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size}
+python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size} --wiki_dump_version=${wiki_dump_version}
 
 echo "step 4. make wordvectors"
 python make_wordvectors.py --lcode=${lcode} --vector_size=${vector_size} --window_size=${window_size} --vocab_size=${vocab_size} --num_negative=${num_negative}
-
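For orientation, a rough sketch of how these shell hyper-parameters presumably feed gensim's Word2Vec call inside make_wordvectors.py. The `window` mapping is an assumption (the relevant keyword list is truncated in the hunk above), the toy sentences are illustrative, and the keyword names follow gensim==3.2.0 (`size`, not `vector_size`) as pinned in requirements.txt:

```python
import gensim

# Values mirror the shell variables passed on the command line.
vector_size = 300    # --vector_size  -> size=
window_size = 5      # --window_size  -> window= (assumed)
num_negative = 5     # --num_negative -> negative=

sentences = [["a", "toy"], ["toy", "corpus"]]  # stand-in for the extracted wiki corpus

model = gensim.models.Word2Vec(sentences,
                               size=vector_size,
                               window=window_size,
                               negative=num_negative,
                               min_count=1)
print(model.wv.most_similar("toy"))
```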

requirements.txt

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+boto==2.48.0
+boto3==1.5.20
+botocore==1.8.34
+bz2file==0.98
+certifi==2018.1.18
+chardet==3.0.4
+docutils==0.14
+gensim==3.2.0
+idna==2.6
+jmespath==0.9.3
+konlpy==0.4.4
+lxml==4.1.1
+nltk==3.2.5
+numpy==1.14.0
+python-crfsuite==0.9.5
+python-dateutil==2.6.1
+pyvi==0.0.8.0
+regex==2018.1.10
+requests==2.18.4
+s3transfer==0.1.12
+scikit-learn==0.19.1
+scipy==1.0.0
+six==1.11.0
+sklearn-crfsuite==0.3.6
+smart-open==1.5.6
+tabulate==0.8.2
+tqdm==4.19.5
+urllib3==1.22
