make it compatible with python 3 #13

Open · wants to merge 3 commits into master
96 changes: 96 additions & 0 deletions .gitignore
@@ -2,3 +2,99 @@ data/
*.pyc
_*

# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/
envbots/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

#IDE
.idea
__pycache__
3 changes: 2 additions & 1 deletion README.md
@@ -25,7 +25,8 @@ This project has two purposes. First of all, I'd like to share some of my experi
* Go get various English word vectors [here](https://github.com/3Top/word2vec-api) if needed.

## Work Flow
* STEP 1. Download the [wikipedia database backup dumps](https://dumps.wikimedia.org/backup-index.html) of the language you want.
* STEP 1-1. Download the [wikipedia database backup dumps](https://dumps.wikimedia.org/backup-index.html) of the language you want.
* STEP 1-2. Install the required packages from `requirements.txt`.
* STEP 2. Extract running texts to `data/` folder.
* STEP 3. Run `build_corpus.py`.
* STEP 4-1. Run `make_wordvector.sh` to get Word2Vec word vectors.
22 changes: 12 additions & 10 deletions build_corpus.py
@@ -1,5 +1,4 @@
# coding: utf-8
#!/usr/bin/python2
import argparse
import codecs
import lxml.etree as ET
@@ -10,37 +9,40 @@
parser = argparse.ArgumentParser()
parser.add_argument('--lcode', help='ISO 639-1 code of target language. See `lcodes.txt`.')
parser.add_argument('--max_corpus_size', type=int, default=1000000000, help='the maximum size of the corpus. Feel free to adjust it according to your computing power.')
parser.add_argument('--wiki_dump_version', help='version of wikimedia dumps')

args = parser.parse_args()

lcode = args.lcode
wiki_dump_version = args.wiki_dump_version
if lcode == 'ko':
from konlpy.tag import Kkma # pip install konlpy. See http://konlpy.org/en/v0.4.4/ for further information.
kkma = Kkma()
print "kkma succesfuly loaded!"
print("kkma succesfuly loaded!")
elif lcode == 'ja':
import MeCab # See https://pypi.python.org/pypi/mecab-python/0.996
mecab = MeCab.Tagger("-Owakati")
print "mecab succesfuly loaded!"
print("mecab succesfuly loaded!")
elif lcode == 'zh':
import jieba # See https://pypi.python.org/pypi/jieba/
print "jieba succesfuly loaded!"
print("jieba succesfuly loaded!")
elif lcode == 'vi':
from pyvi.pyvi import ViTokenizer # See https://pypi.python.org/pypi/pyvi
print "pyvi succesfuly loaded!"
print("pyvi succesfuly loaded!")
elif lcode == 'th':
import pythai # See https://pypi.python.org/pypi/pythai
print "pythai succesfuly loaded!"
print("pythai succesfuly loaded!")
# elif lcode == 'ar':
# os.environ['CLASSPATH'] = "../stanford-segmenter-2015-12-09"
# from nltk.tokenize.stanford_segmenter import StanfordSegmenter
# segmenter = StanfordSegmenter(path_to_jar="../stanford-segmenter-2015-12-09/stanford-segmenter-3.6.0.jar",
# path_to_sihan_corpora_dict="../stanford-segmenter-2015-12-09/data",
# path_to_model="../stanford-segmenter-2015-12-09/data/pku.gz",
# path_to_dict="../stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz")
# print "StanfordSegmenter succesfuly loaded!"
# print ("StanfordSegmenter succesfuly loaded!")

max_corpus_size = args.max_corpus_size
fname = "{}wiki-20161201-pages-articles-multistream.xml".format(lcode)
fname = "{}wiki-{}-pages-articles-multistream.xml".format(lcode, wiki_dump_version)

def clean_text(text):
global lcode
@@ -157,7 +159,7 @@ def build_corpus():
continue # it's okay as we have a pretty big corpus!
elem.clear() # We need to save memory!
if i % 1000 == 0:
print i,
print(i),
fsize = os.path.getsize("data/{}.txt".format(lcode))
if fsize > max_corpus_size:
break
@@ -166,4 +168,4 @@ def build_corpus():
if __name__ == "__main__":
build_corpus()

print "Done"
print("Done")
17 changes: 10 additions & 7 deletions make_wordvectors.py
@@ -1,10 +1,9 @@
# coding: utf-8
#!/usr/bin/python2
import nltk
import os
import codecs
import argparse
import numpy as np
import sys

# arguments setting
parser = argparse.ArgumentParser()
@@ -40,9 +39,13 @@ def get_min_count(sents):
def make_wordvectors():
global lcode
import gensim # In case you have difficulties installing gensim, you need to consider installing conda.
import cPickle as pickle

if sys.version_info[0] >= 3:
import pickle
else:
import cPickle as pickle

print "Making sentences as list..."
print("Making sentences as list...")
sents = []
with codecs.open('data/{}.txt'.format(lcode), 'r', 'utf-8') as fin:
while 1:
@@ -52,7 +55,7 @@ def make_wordvectors():
words = line.split()
sents.append(words)

print "Making word vectors..."
print("Making word vectors...")
min_count = get_min_count(sents)
model = gensim.models.Word2Vec(sents, size=vector_size, min_count=min_count,
negative=num_negative,
@@ -62,11 +65,11 @@

# Save to file
with codecs.open('data/{}.tsv'.format(lcode), 'w', 'utf-8') as fout:
for i, word in enumerate(model.index2word):
for i, word in enumerate(model.wv.index2word):
fout.write(u"{}\t{}\t{}\n".format(str(i), word.encode('utf8').decode('utf8'),
np.array_str(model[word])
))
if __name__ == "__main__":
make_wordvectors()

print "Done"
print("Done")
17 changes: 11 additions & 6 deletions make_wordvectors.sh
100644 → 100755
@@ -2,26 +2,31 @@

#### Set your hyper-parameters here ####
############## START ###################
lcode="xx" # ISO 639-1 code of target language. See `lcodes.txt`.
max_corpus_size=1000000000 # the maximum size of the corpus. Feel free to adjust it according to your computing power.
lcode="id" # ISO 639-1 code of target language. See `lcodes.txt`.
wiki_dump_version="20180120" # version of wikimedia dumps
max_corpus_size=10000000000 # the maximum size of the corpus. Feel free to adjust it according to your computing power.
vector_size=300 # the size of a word vector
window_size=5 # the maximum distance between the current and predicted word within a sentence.
vocab_size=20000 # the maximum vocabulary size
num_negative=5 # the int for negative specifies how many “noise words” should be drawn
############## END #####################

echo "step 0. Make `data` directory and move there.`
echo "step 0. Install packages according to requirements.txt"
pip install -r requirements.txt

echo "step 0. Make `data` directory and move there."
mkdir data; cd data

echo "step 1. Download the stored wikipedia file to your disk."
wget "https://dumps.wikimedia.org/${lcode}wiki/20161201/${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
rm -rf ${lcode}wiki-${wiki_dump_version}-pages-articles-multistream*
wget "https://dumps.wikimedia.org/${lcode}wiki/${wiki_dump_version}/${lcode}wiki-${wiki_dump_version}-pages-articles-multistream.xml.bz2"

echo "step 2. Extract the bz2 file."
bzip2 -d "${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
bzip2 -d "${lcode}wiki-${wiki_dump_version}-pages-articles-multistream.xml.bz2"

cd ..
echo "step 3. Build Corpus."
python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size}
python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size} --wiki_dump_version=${wiki_dump_version}

echo "step 4. make wordvectors"
python make_wordvectors.py --lcode=${lcode} --vector_size=${vector_size} --window_size=${window_size} --vocab_size=${vocab_size} --num_negative=${num_negative}
28 changes: 28 additions & 0 deletions requirements.txt
@@ -0,0 +1,28 @@
boto==2.48.0
boto3==1.5.20
botocore==1.8.34
bz2file==0.98
certifi==2018.1.18
chardet==3.0.4
docutils==0.14
gensim==3.2.0
idna==2.6
jmespath==0.9.3
konlpy==0.4.4
lxml==4.1.1
nltk==3.2.5
numpy==1.14.0
python-crfsuite==0.9.5
python-dateutil==2.6.1
pyvi==0.0.8.0
regex==2018.1.10
requests==2.18.4
s3transfer==0.1.12
scikit-learn==0.19.1
scipy==1.0.0
six==1.11.0
sklearn-crfsuite==0.3.6
smart-open==1.5.6
tabulate==0.8.2
tqdm==4.19.5
urllib3==1.22