This repository was archived by the owner on Dec 19, 2018. It is now read-only.

Merge original repo's PR: Kyubyong#13
Astro36 committed Mar 25, 2018
1 parent 5604ff8 commit 181f0a8
Showing 5 changed files with 63 additions and 25 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -28,7 +28,8 @@ This project has two purposes. First of all, I'd like to share some of my experi

## Work Flow

* STEP 1. Download the [wikipedia database backup dumps](https://dumps.wikimedia.org/backup-index.html) of the language you want (for example, for english wiki go to `https://dumps.wikimedia.org/enwiki/` click the latest timestamp, and download the `enwiki-YYYYMMDD-pages-articles-multistream.xml.bz2` file).
* STEP 1-1. Download the [wikipedia database backup dumps](https://dumps.wikimedia.org/backup-index.html) of the language you want (for example, for english wiki go to `https://dumps.wikimedia.org/enwiki/` click the latest timestamp, and download the `enwiki-YYYYMMDD-pages-articles-multistream.xml.bz2` file).
* STEP 1-2. Install requirements packages
* STEP 2. Extract running texts to `data/` folder.
* STEP 3. Run `build_corpus.py`.
* STEP 4-1. Run `make_wordvector.sh` to get Word2Vec word vectors.
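A minimal Python sketch of STEP 1 follows, assuming the dump URL layout used by `make_wordvectors.sh` later in this commit; `download_dump` is a hypothetical helper and the example `lcode`/dump version are illustrative (the shell script does the same thing with `wget`):

```python
# Illustrative sketch of STEP 1, assuming the URL layout used in
# make_wordvectors.sh below. download_dump is a hypothetical helper,
# not part of this repository.
import requests  # requests==2.18.4 is pinned in requirements.txt


def download_dump(lcode, wiki_dump_version, out_dir="data"):
    # assumes out_dir already exists (cf. `mkdir data` in make_wordvectors.sh)
    fname = "{}wiki-{}-pages-articles-multistream.xml.bz2".format(lcode, wiki_dump_version)
    url = "https://dumps.wikimedia.org/{}wiki/{}/{}".format(lcode, wiki_dump_version, fname)
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    with open("{}/{}".format(out_dir, fname), "wb") as fout:
        for chunk in resp.iter_content(chunk_size=1 << 20):  # stream in 1 MB chunks
            fout.write(chunk)
    return fname


# download_dump("id", "20180120")  # values taken from make_wordvectors.sh
```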
22 changes: 12 additions & 10 deletions build_corpus.py
@@ -1,5 +1,4 @@
# coding: utf-8
#!/usr/bin/python2
import argparse
import codecs
import lxml.etree as ET
@@ -10,37 +9,40 @@
parser = argparse.ArgumentParser()
parser.add_argument('--lcode', help='ISO 639-1 code of target language. See `lcodes.txt`.')
parser.add_argument('--max_corpus_size', type=int, default=1000000000, help='the maximum size of the corpus. Feel free to adjust it according to your computing power.')
parser.add_argument('--wiki_dump_version', help='version of wikimedia dumps')

args = parser.parse_args()

lcode = args.lcode
wiki_dump_version = args.wiki_dump_version
if lcode == 'ko':
from konlpy.tag import Kkma # pip install konlpy. See http://konlpy.org/en/v0.4.4/ for further information.
kkma = Kkma()
print "kkma succesfuly loaded!"
print("kkma succesfuly loaded!")
elif lcode == 'ja':
import MeCab # See https://pypi.python.org/pypi/mecab-python/0.996
mecab = MeCab.Tagger("-Owakati")
print "mecab succesfuly loaded!"
print("mecab succesfuly loaded!")
elif lcode == 'zh':
import jieba # See https://pypi.python.org/pypi/jieba/
print "jieba succesfuly loaded!"
print("jieba succesfuly loaded!")
elif lcode == 'vi':
from pyvi.pyvi import ViTokenizer # See https://pypi.python.org/pypi/pyvi
print "pyvi succesfuly loaded!"
print("pyvi succesfuly loaded!")
elif lcode == 'th':
import pythai # See https://pypi.python.org/pypi/pythai
print "pythai succesfuly loaded!"
print("pythai succesfuly loaded!")
# elif lcode == 'ar':
# os.environ['CLASSPATH'] = "../stanford-segmenter-2015-12-09"
# from nltk.tokenize.stanford_segmenter import StanfordSegmenter
# segmenter = StanfordSegmenter(path_to_jar="../stanford-segmenter-2015-12-09/stanford-segmenter-3.6.0.jar",
# path_to_sihan_corpora_dict="../stanford-segmenter-2015-12-09/data",
# path_to_model="../stanford-segmenter-2015-12-09/data/pku.gz",
# path_to_dict="../stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz")
# print "StanfordSegmenter succesfuly loaded!"
# print ("StanfordSegmenter succesfuly loaded!")

max_corpus_size = args.max_corpus_size
fname = "{}wiki-20161201-pages-articles-multistream.xml".format(lcode)
fname = "{}wiki-{}-pages-articles-multistream.xml".format(lcode, wiki_dump_version)

def clean_text(text):
global lcode
@@ -157,7 +159,7 @@ def build_corpus():
continue # it's okay as we have a pretty big corpus!
elem.clear() # We need to save memory!
if i % 1000 == 0:
print i,
print(i),
fsize = os.path.getsize("data/{}.txt".format(lcode))
if fsize > max_corpus_size:
break
@@ -166,4 +168,4 @@ def build_corpus():
if __name__ == "__main__":
build_corpus()

print "Done"
print("Done")
17 changes: 10 additions & 7 deletions make_wordvectors.py
@@ -1,10 +1,9 @@
# coding: utf-8
#!/usr/bin/python2
import nltk
import os
import codecs
import argparse
import numpy as np
import sys

# arguments setting
parser = argparse.ArgumentParser()
@@ -40,9 +39,13 @@ def get_min_count(sents):
def make_wordvectors():
global lcode
import gensim # In case you have difficulties installing gensim, you need to consider installing conda.
import cPickle as pickle

if sys.version_info[0] >= 3:
import pickle
else:
import cPickle as pickle

print "Making sentences as list..."
print("Making sentences as list...")
sents = []
with codecs.open('data/{}.txt'.format(lcode), 'r', 'utf-8') as fin:
while 1:
@@ -52,7 +55,7 @@ def make_wordvectors():
words = line.split()
sents.append(words)

print "Making word vectors..."
print("Making word vectors...")
min_count = get_min_count(sents)
model = gensim.models.Word2Vec(sents, size=vector_size, min_count=min_count,
negative=num_negative,
@@ -62,11 +65,11 @@

# Save to file
with codecs.open('data/{}.tsv'.format(lcode), 'w', 'utf-8') as fout:
for i, word in enumerate(model.index2word):
for i, word in enumerate(model.wv.index2word):
fout.write(u"{}\t{}\t{}\n".format(str(i), word.encode('utf8').decode('utf8'),
np.array_str(model[word])
))
if __name__ == "__main__":
make_wordvectors()

print "Done"
print("Done")
18 changes: 11 additions & 7 deletions make_wordvectors.sh
@@ -2,27 +2,31 @@

#### Set your hyper-parameters here ####
############## START ###################
lcode="xx" # ISO 639-1 code of target language. See `lcodes.txt`.
max_corpus_size=1000000000 # the maximum size of the corpus. Feel free to adjust it according to your computing power.
lcode="id" # ISO 639-1 code of target language. See `lcodes.txt`.
wiki_dump_version="20180120" # version of wikimedia dumps
max_corpus_size=10000000000 # the maximum size of the corpus. Feel free to adjust it according to your computing power.
vector_size=300 # the size of a word vector
window_size=5 # the maximum distance between the current and predicted word within a sentence.
vocab_size=20000 # the maximum vocabulary size
num_negative=5 # the int for negative specifies how many “noise words” should be drawn
############## END #####################

echo "step 0. Make `data` directory and move there.`
echo "step 0. Install packages according to requirements.txt"
pip install -r requirements.txt

echo "step 0. Make `data` directory and move there."
mkdir data; cd data

echo "step 1. Download the stored wikipedia file to your disk."
wget "https://dumps.wikimedia.org/${lcode}wiki/20161201/${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
rm -rf ${lcode}wiki-${wiki_dump_version}-pages-articles-multistream*
wget "https://dumps.wikimedia.org/${lcode}wiki/${wiki_dump_version}/${lcode}wiki-${wiki_dump_version}-pages-articles-multistream.xml.bz2"

echo "step 2. Extract the bz2 file."
bzip2 -d "${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
bzip2 -d "${lcode}wiki-${wiki_dump_version}-pages-articles-multistream.xml.bz2"

cd ..
echo "step 3. Build Corpus."
python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size}
python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size} --wiki_dump_version=${wiki_dump_version}

echo "step 4. make wordvectors"
python make_wordvectors.py --lcode=${lcode} --vector_size=${vector_size} --window_size=${window_size} --vocab_size=${vocab_size} --num_negative=${num_negative}
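For reference, the hyper-parameters set at the top of this script end up as keyword arguments to gensim's `Word2Vec`. The sketch below shows a plausible mapping: `size`, `min_count`, and `negative` appear in the visible part of `make_wordvectors.py`, while using `window` for `window_size` and `max_vocab_size` for `vocab_size` is an assumption, since that part of the call is collapsed in this diff:

```python
# Sketch of how the shell variables plausibly reach gensim (gensim==3.2.0).
# The window/max_vocab_size mapping is an assumption; see the note above.
import gensim

vector_size, window_size, vocab_size, num_negative = 300, 5, 20000, 5  # values from make_wordvectors.sh

sents = [[u"an", u"illustrative", u"sentence"], [u"another", u"one"]]   # toy input
model = gensim.models.Word2Vec(sents,
                               size=vector_size,            # dimensionality of the word vectors
                               window=window_size,          # max distance between current and predicted word
                               max_vocab_size=vocab_size,   # assumption: stand-in for the vocab_size knob
                               negative=num_negative,       # how many "noise words" to draw
                               min_count=1)                 # keep every toy token
```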
28 changes: 28 additions & 0 deletions requirements.txt
@@ -0,0 +1,28 @@
boto==2.48.0
boto3==1.5.20
botocore==1.8.34
bz2file==0.98
certifi==2018.1.18
chardet==3.0.4
docutils==0.14
gensim==3.2.0
idna==2.6
jmespath==0.9.3
konlpy==0.4.4
lxml==4.1.1
nltk==3.2.5
numpy==1.14.0
python-crfsuite==0.9.5
python-dateutil==2.6.1
pyvi==0.0.8.0
regex==2018.1.10
requests==2.18.4
s3transfer==0.1.12
scikit-learn==0.19.1
scipy==1.0.0
six==1.11.0
sklearn-crfsuite==0.3.6
smart-open==1.5.6
tabulate==0.8.2
tqdm==4.19.5
urllib3==1.22
