diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..90cb566 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018-2019 Mu Yang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..0cf67ed --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include about.pyx +include ckipws/*.* +include ckipparser/*.* diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5b37b50 --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +CC = gcc +PY = python +RM = rm -rf + +.PHONY: all build build_ext dist bdist bdist_wheel sdist upload clean run + +all: build + +dist: sdist + +build: build_ext + +bdist: bdist_wheel + +build_ext bdist_wheel sdist: + $(PY) setup.py $@ + +upload: dist + twine upload --repository-url https://test.pypi.org/legacy/ dist/*.tar.gz --verbose + +clean: + $(PY) setup.py clean -a + $(RM) build dist pyckip.egg-info diff --git a/README.md b/README.md deleted file mode 100644 index 6dbb6e2..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# pyckip -CKIP NLP Wrappers using Cython (Word Segmentation and Parser) diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..6b0d295 --- /dev/null +++ b/README.rst @@ -0,0 +1,119 @@ +PyCkip +======== + +CKIP NLP Wrappers (Word Segmentation and Parser) + +Introduction +------------ + +Git +^^^ + +https://github.com/emfomy/pyckip + +|Github Release| |Github License| |Github Forks| |Github Stars| |Github Watchers| + +.. |Github Release| image:: https://img.shields.io/github/release/emfomy/pyckip/all.svg?maxAge=3600 + :target: https://github.com/emfomy/pyckip/releases + +.. |Github License| image:: https://img.shields.io/github/license/emfomy/pyckip.svg?maxAge=3600 + +.. |Github Downloads| image:: https://img.shields.io/github/downloads/emfomy/pyckip/total.svg?maxAge=3600 + :target: https://github.com/emfomy/pyckip/releases/latest + +.. |Github Forks| image:: https://img.shields.io/github/forks/emfomy/pyckip.svg?style=social&label=Fork&maxAge=3600 + +.. |Github Stars| image:: https://img.shields.io/github/stars/emfomy/pyckip.svg?style=social&label=Star&maxAge=3600 + +.. 
|Github Watchers| image:: https://img.shields.io/github/watchers/emfomy/pyckip.svg?style=social&label=Watch&maxAge=3600 + +PyPI +^^^^ + +https://pypi.org/project/pyckip + +|Pypi Version| |Pypi License| |Pypi Format| |Pypi Python| |Pypi Implementation| |Pypi Status| + +.. |Pypi Version| image:: https://img.shields.io/pypi/v/pyckip.svg?maxAge=3600 + :target: https://pypi.org/project/pyckip + +.. |Pypi License| image:: https://img.shields.io/pypi/l/pyckip.svg?maxAge=3600 + +.. |Pypi Format| image:: https://img.shields.io/pypi/format/pyckip.svg?maxAge=3600 + +.. |Pypi Python| image:: https://img.shields.io/pypi/pyversions/pyckip.svg?maxAge=3600 + +.. |Pypi Implementation| image:: https://img.shields.io/pypi/implementation/pyckip.svg?maxAge=3600 + +.. |Pypi Status| image:: https://img.shields.io/pypi/status/pyckip.svg?maxAge=3600 + +Author +^^^^^^ + +* Mu Yang + +Requirements +^^^^^^^^^^^^ + +* `Python <https://www.python.org>`_ 2.7+, 3.5+ +* `Cython <https://cython.org>`_ 0.29+ +* `Boost C++ Libraries <https://www.boost.org>`_ 1.54.0 +* CKIP Word Segmentation Linux version +* CKIP Parser Linux version + +Installation +^^^^^^^^^^^^ + +Step 1: Setup CKIPWS environment +"""""""""""""""""""""""""""""""" + +Denote ``<ckipws-linux-root>`` as the root path of CKIPWS Linux Version. Add the commands below to ``~/.bashrc`` + +.. code-block:: bash + + export LD_LIBRARY_PATH=<ckipws-linux-root>/lib:$LD_LIBRARY_PATH + export CKIPWS_DATA2=<ckipws-linux-root>/Data2 + +Step 2: Setup CKIP-Parser environment +""""""""""""""""""""""""""""""""""""" + +Denote ``<ckipparser-linux-root>`` as the root path of CKIP-Parser Linux Version. Add the commands below to ``~/.bashrc`` + +.. code-block:: bash + + export LD_LIBRARY_PATH=<ckipparser-linux-root>/lib:$LD_LIBRARY_PATH + export CKIPPARSER_RULE=<ckipparser-linux-root>/Rule + export CKIPPARSER_RDB=<ckipparser-linux-root>/RDB + +Step 3: Install Using Pip +""""""""""""""""""""""""" + +.. code-block:: bash + + LIBRARY_PATH=<ckipws-linux-root>/lib:<ckipparser-linux-root>/lib:$LIBRARY_PATH pip install pyckip + +FAQ +--- + +* I don't have CKIPWS/CKIP-Parser. What should I do? 
+ +Append :code:`--install-option='--no-ws'` or :code:`--install-option='--no-parser'` after the :code:`pip install` command to disable CKIPWS or CKIP-Parser. + +.. code-block:: bash + + # Disable CKIPWS support + pip install pyckip --install-option='--no-ws' + + # Disable CKIP-Parser support + pip install pyckip --install-option='--no-parser' + +* The CKIPWS throws "``what(): locale::facet::_S_create_c_locale name not valid``". What should I do? + +.. code-block:: bash + + apt-get install locales-all + +License +------- + +* `MIT License `_ diff --git a/about.pyx b/about.pyx new file mode 100644 index 0000000..83d43a0 --- /dev/null +++ b/about.pyx @@ -0,0 +1,14 @@ +# -*- coding:utf-8 -*- +# cython: language_level=3 + +__author_name__ = 'Mu Yang' +__author_email__ = 'emfomy@gmail.com' +__copyright__ = 'Copyright 2018-2019' + +__title__ = 'pyckip' +__version__ = '0.3.0' +__description__ = 'CKIP NLP Wrappers' +__license__ = "MIT" + +__url__ = 'https://github.com/emfomy/pyckip' +__download_url__ = __url__+'/tarball/'+__version__ diff --git a/ckipparser/__init__.py b/ckipparser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ckipparser/cckipparser.pxd b/ckipparser/cckipparser.pxd new file mode 100644 index 0000000..9f62283 --- /dev/null +++ b/ckipparser/cckipparser.pxd @@ -0,0 +1,18 @@ +# -*- coding:utf-8 -*- + +__author__ = 'Mu Yang ' +__copyright__ = 'Copyright 2018-2019' + +cdef extern: + + ctypedef void* corenlp_t + + corenlp_t CKIPCoreNLP_New() + int CKIPCoreNLP_InitData(corenlp_t obj, char *FileName); + int CKIPCoreNLP_ApplyFile(corenlp_t obj, char *input, char *output); + int CKIPCoreNLP_ApplyList(corenlp_t obj, int length, const Py_UNICODE **inputList); + int CKIPCoreNLP_Parse(corenlp_t obj, const Py_UNICODE* pwsText, Py_UNICODE** ppwsResult); + int CKIPCoreNLP_ParseFile(corenlp_t obj, char *input, char *output); + const Py_UNICODE* CKIPCoreNLP_GetResultBegin(corenlp_t obj); + const Py_UNICODE* CKIPCoreNLP_GetResultNext(corenlp_t obj); + 
void CKIPCoreNLP_Destroy(corenlp_t obj); diff --git a/ckipparser/ckipparser.pyx b/ckipparser/ckipparser.pyx new file mode 100644 index 0000000..e517f87 --- /dev/null +++ b/ckipparser/ckipparser.pyx @@ -0,0 +1,230 @@ +# -*- coding:utf-8 -*- +# cython: language_level=3 + +from __future__ import print_function + +__author__ = 'Mu Yang ' +__copyright__ = 'Copyright 2018-2019' +include '../about.pyx' + +cimport ckipparser.cckipparser as cckipparser +from libc.stdlib cimport malloc, free +from cpython.unicode cimport PyUnicode_AsUnicode, PyUnicode_FromUnicode + +import datetime as __datetime +import os as __os +import re as __re +import sys as __sys +import tempfile as __tempfile + +from ckipws import CkipWS as __CkipWS + +def __to_bytes(text): + return text.encode() if __sys.version_info >= (3, 0) else text + +def __from_bytes(text): + return text.decode() if __sys.version_info >= (3, 0) else text + +def __to_unicode(text): + return text if __sys.version_info >= (3, 0) else text.decode('utf-8') + +def __from_unicode(text): + return text if __sys.version_info >= (3, 0) else text.encode('utf-8') + +cdef class CkipParser: + """The CKIP parser driver. + + Args: + logger (bool): enable logger. + inifile (str): the INI file. + wsinifile (str): the INI file for CKIPWS. + options: the optiones (see :func:`create_ini`). 
+ """ + + cdef cckipparser.corenlp_t __obj + + def __cinit__(self, *, logger=False, inifile=None, wsinifile=None, **options): + # Create the native CKIPCoreNLP handle and initialize it from an INI file. + # INI files not supplied by the caller are generated into temporary files. + # Raises MemoryError if the handle cannot be created, TypeError for unknown + # options and IOError when CKIPCoreNLP_InitData reports failure. + + self.__obj = cckipparser.CKIPCoreNLP_New() + if self.__obj is NULL: + raise MemoryError() + + if logger: + self.enable_logger() + + # Temporary INI files; each stays None when the caller supplied that path. + fwsini = None + fini = None + + try: + if not wsinifile: + fwsini = __tempfile.NamedTemporaryFile(mode='w') + wsinifile = fwsini.name + wsinidata, options = __CkipWS.create_ini(**options) + fwsini.write(__from_unicode(wsinidata)) + fwsini.flush() + + if not inifile: + fini = __tempfile.NamedTemporaryFile(mode='w') + inifile = fini.name + inidata, options = self.create_ini(wsinifile=wsinifile, **options) + fini.write(__from_unicode(inidata)) + fini.flush() + + # Options left over after the create_ini calls are invalid: the dummy + # below only accepts "_", so any other keyword raises TypeError. + def CkipParser(*, _=None): return None + CkipParser(**options) + + name = __to_bytes(inifile) + ret = cckipparser.CKIPCoreNLP_InitData(self.__obj, name) + if not ret: + raise IOError() + + finally: + # Runs on success and on failure: close whichever temporary files exist. + if fwsini is not None: + fwsini.close() + if fini is not None: + fini.close() + + def __dealloc__(self): + if self.__obj is not NULL: + cckipparser.CKIPCoreNLP_Destroy(self.__obj) + pass + + def enable_logger(self): + """Enable logger.""" + # cckipparser.CKIPCoreNLP_EnableConsoleLogger(self.__obj) + pass + + def __call__(self, text, unicode=False): + """Parse a sentence. + + Args: + text (str): the input sentence. + unicode (bool): use Unicode for input/output encoding; otherwise use system encoding. + + Return: + str: the output sentence. + """ + return self.apply_list([text], unicode=unicode)[0] + + def apply_list(self, ilist, unicode=False): + """Parse a list of sentences. + + Args: + ilist (list): the list of input sentences (str). + unicode (bool): use Unicode for input/output encoding; otherwise use system encoding. + + Return: + list: the list of output sentences (str). 
+ """ + + inum = len(ilist) + if not unicode: + ilist = [__to_unicode(l) for l in ilist] + + # Temporary C array holding one PyUnicode_AsUnicode() pointer per input sentence; freed right after the call. + iarr = malloc(sizeof(const Py_UNICODE*) * inum) + for i in range(inum): + iarr[i] = PyUnicode_AsUnicode(ilist[i]) + ret = cckipparser.CKIPCoreNLP_ApplyList(self.__obj, inum, iarr) + free(iarr) + # NOTE(review): CKIPCoreNLP_ApplyList is declared to return int, so this comparison with None cannot fail; confirm the library's success value and check it explicitly. + assert ret is not None + + # Collect the results by walking the library's Begin/Next functions until one returns NULL. + cdef const Py_UNICODE* result + olist = [] + result = cckipparser.CKIPCoreNLP_GetResultBegin(self.__obj) + while result is not NULL: + olist.append(PyUnicode_FromUnicode(result, len(result)).strip()) + result = cckipparser.CKIPCoreNLP_GetResultNext(self.__obj) + + if not unicode: + olist = [__from_unicode(l) for l in olist] + + return olist + + def apply_file(self, ifile=None, ofile=None): + """Parse a file. + + Both paths are required; the None defaults are rejected by assertions. + + Args: + ifile (str): the input file. + ofile (str): the output file (will be overwritten). + """ + assert ifile is not None + assert ofile is not None + ifile = __to_bytes(ifile) + ofile = __to_bytes(ofile) + + ret = cckipparser.CKIPCoreNLP_ApplyFile(self.__obj, ifile, ofile) + # NOTE(review): same as in apply_list, an int return value is never None, so this assertion cannot fail. + assert ret is not None + + @staticmethod + def create_ini(*, wsinifile=None, ruledir=None, rdbdir=None, \ + do_ws=True, do_parse=True, do_role=True, **options): + """Generate config. + + Args: + wsinifile (str): the INI file for CKIPWS (required). + ruledir (str): the path to "Rule/". + rdbdir (str): the path to "RDB/". + + do_ws (bool): do word-segmentation. + do_parse (bool): do parsing. + do_role (bool): do role. + + Return: + tuple: the INI text (str) and the dict of options that were not consumed. 
+ """ + assert wsinifile is not None + + # Directories default to the environment; a missing value only prints a warning and generation continues. + if ruledir is None: + ruledir = __os.getenv('CKIPPARSER_RULE') + if not ruledir: + print('Warning: $CKIPPARSER_RULE is unset or null') + + if rdbdir is None: + rdbdir = __os.getenv('CKIPPARSER_RDB') + if not rdbdir: + print('Warning: $CKIPPARSER_RDB is unset or null') + + # INI flags derived from the task switches: IsTag is the inverse of do_ws, AssignRole mirrors do_role. + IsTag = not do_ws + AssignRole = do_role + AssignRoleOnly = False + + # Without parsing, the only accepted combination is role assignment alone (do_ws=False, do_role=True); every other combination raises ValueError. + if not do_parse: + if not do_ws and not do_role: + raise ValueError('Must select at least one task') + if do_ws and not do_role: + raise ValueError('Use ckipws.CkipWS for word-segmentation') + if not do_ws and do_role: + AssignRoleOnly = True + if do_ws and do_role: + raise ValueError('Invalid tasks') + + # The INI text is assembled line by line and joined with newlines at the end. + cfg = [] + + cfg.append('[WordSeg]') + cfg.append('ini={wsinifile}'.format(wsinifile=wsinifile)) + cfg.append('') + + cfg.append('[Parser]') + cfg.append('SetPos13=0') + cfg.append('13CateFile={ruledir}/13Cate.txt'.format(ruledir=ruledir)) + cfg.append('') + + # cfg.append('SetMap=1') + cfg.append('SetMap=0') + cfg.append('CatMapFile={ruledir}/CatMap.txt'.format(ruledir=ruledir)) + cfg.append('') + + cfg.append('GrammarRule={ruledir}/CKIP-Rule.txt'.format(ruledir=ruledir)) + cfg.append('HeadRule={ruledir}/CKIP-Head.txt'.format(ruledir=ruledir)) + cfg.append('') + + cfg.append('SetChangePos=1') + cfg.append('SentenceDelimiter=,,;。!?') + cfg.append('SetLength=15') + cfg.append('NormalPos=1') + cfg.append('NormalTree=1') + cfg.append('IsTag={IsTag}'.format(IsTag=int(IsTag))) + cfg.append('') + + cfg.append('[SRL]') + cfg.append('DataPath={rdbdir}/'.format(rdbdir=rdbdir)) + cfg.append('AssignRole={AssignRole}'.format(AssignRole=int(AssignRole))) + cfg.append('AssignRoleOnly={AssignRoleOnly}'.format(AssignRoleOnly=int(AssignRoleOnly))) + + return '\n'.join(cfg), options diff --git a/ckipws/__init__.py b/ckipws/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ckipws/cckipws.pxd b/ckipws/cckipws.pxd new file mode 100644 index 0000000..88b6b7a --- /dev/null +++ 
b/ckipws/cckipws.pxd @@ -0,0 +1,20 @@ +# -*- coding:utf-8 -*- + +__author__ = 'Mu Yang ' +__copyright__ = 'Copyright 2018-2019' + +cdef extern: + + ctypedef void* wordseg_t + + wordseg_t WordSeg_New() + int WordSeg_InitData(wordseg_t obj, char *FileName); + int WordSeg_ApplyFile(wordseg_t obj, char *input, char *output, char *uwfile); + int WordSeg_ApplyList(wordseg_t obj, int length, const Py_UNICODE **inputList); + int WordSeg_ApplyArticle(wordseg_t obj, int length, const Py_UNICODE **inputList); + const Py_UNICODE* WordSeg_GetResultBegin(wordseg_t obj); + const Py_UNICODE* WordSeg_GetResultNext(wordseg_t obj); + const Py_UNICODE* WordSeg_GetUWBegin(wordseg_t obj); + const Py_UNICODE* WordSeg_GetUWNext(wordseg_t obj); + void WordSeg_EnableConsoleLogger(wordseg_t obj); + void WordSeg_Destroy(wordseg_t obj); diff --git a/ckipws/ckipws.pyx b/ckipws/ckipws.pyx new file mode 100644 index 0000000..9bea8c9 --- /dev/null +++ b/ckipws/ckipws.pyx @@ -0,0 +1,413 @@ +# -*- coding:utf-8 -*- +# cython: language_level=3 + +from __future__ import print_function + +__author__ = 'Mu Yang ' +__copyright__ = 'Copyright 2018-2019' +include '../about.pyx' + +cimport ckipws.cckipws as cckipws +from libc.stdlib cimport malloc, free +from cpython.unicode cimport PyUnicode_AsUnicode, PyUnicode_FromUnicode + +import datetime as __datetime +import os as __os +import re as __re +import sys as __sys +import tempfile as __tempfile + +def __to_bytes(text): + return text.encode() if __sys.version_info >= (3, 0) else text + +def __from_bytes(text): + return text.decode() if __sys.version_info >= (3, 0) else text + +def __to_unicode(text): + return text if __sys.version_info >= (3, 0) else text.decode('utf-8') + +def __from_unicode(text): + return text if __sys.version_info >= (3, 0) else text.encode('utf-8') + +cdef class CkipWS: + """The CKIP word segmentation driver. + + Args: + logger (bool): enable logger. + inifile (str): the INI file. + options: the optiones (see :func:`create_ini`). 
+ """ + + cdef cckipws.wordseg_t __obj + + def __cinit__(self, *, logger=False, inifile=None, **options): + # Create the native WordSeg handle and initialize it from an INI file. + # When no INI file is supplied, one is generated into a temporary file. + # Raises MemoryError if the handle cannot be created, TypeError for unknown + # options and IOError when WordSeg_InitData reports failure. + + self.__obj = cckipws.WordSeg_New() + if self.__obj is NULL: + raise MemoryError() + + if logger: + self.enable_logger() + + # Temporary INI file; stays None when the caller supplied inifile. + fini = None + + try: + if not inifile: + fini = __tempfile.NamedTemporaryFile(mode='w') + inifile = fini.name + inidata, options = self.create_ini(**options) + fini.write(__from_unicode(inidata)) + fini.flush() + + # Options left over after create_ini are invalid: the dummy below + # only accepts "_", so any other keyword raises TypeError. + def CkipWS(*, _=None): return None + CkipWS(**options) + + name = __to_bytes(inifile) + ret = cckipws.WordSeg_InitData(self.__obj, name) + if not ret: + raise IOError() + + finally: + # Runs on success and on failure: close the temporary file if one was created. + if fini is not None: + fini.close() + + def __dealloc__(self): + if self.__obj is not NULL: + cckipws.WordSeg_Destroy(self.__obj) + pass + + def enable_logger(self): + """Enable logger.""" + cckipws.WordSeg_EnableConsoleLogger(self.__obj) + + def __call__(self, text, unicode=False): + """Segment a sentence. + + Args: + text (str): the input sentence. + unicode (bool): use Unicode for input/output encoding; otherwise use system encoding. + + Return: + str: the output sentence. + """ + return self.apply_list([text], unicode=unicode)[0] + + def apply_list(self, ilist, unicode=False): + """Segment a list of sentences. + + Args: + ilist (list): the list of input sentences (str). + unicode (bool): use Unicode for input/output encoding; otherwise use system encoding. + + Return: + list: the list of output sentences (str). 
+ """ + + inum = len(ilist) + if not unicode: + ilist = [__to_unicode(l) for l in ilist] + + # Temporary C array holding one PyUnicode_AsUnicode() pointer per input sentence; freed right after the call. + iarr = malloc(sizeof(const Py_UNICODE*) * inum) + for i in range(inum): + iarr[i] = PyUnicode_AsUnicode(ilist[i]) + ret = cckipws.WordSeg_ApplyList(self.__obj, inum, iarr) + free(iarr) + # NOTE(review): WordSeg_ApplyList is declared to return int, so this comparison with None cannot fail; confirm the library's success value and check it explicitly. + assert ret is not None + + # Collect the results by walking the library's Begin/Next functions until one returns NULL. + cdef const Py_UNICODE* result + olist = [] + result = cckipws.WordSeg_GetResultBegin(self.__obj) + while result is not NULL: + olist.append(PyUnicode_FromUnicode(result, len(result)).strip()) + result = cckipws.WordSeg_GetResultNext(self.__obj) + + if not unicode: + olist = [__from_unicode(l) for l in olist] + + return olist + + def apply_file(self, ifile=None, ofile=None, uwfile=''): + """Segment a file. + + The input and output paths are required; their None defaults are rejected by assertions. + + Args: + ifile (str): the input file. + ofile (str): the output file (will be overwritten). + uwfile (str): the unknown word file (will be overwritten). + """ + assert ifile is not None + assert ofile is not None + ifile = __to_bytes(ifile) + ofile = __to_bytes(ofile) + uwfile = __to_bytes(uwfile) + + ret = cckipws.WordSeg_ApplyFile(self.__obj, ifile, ofile, uwfile) + # NOTE(review): same as in apply_list, an int return value is never None, so this assertion cannot fail. + assert ret is not None + + @staticmethod + def create_ini(*, data2dir=None, lexfile=None, NewStyleFormat=False, ShowCategory=True, **options): + """Generate config. + + Args: + data2dir (str): the path to "Data2/". + lexfile (str): the path to user-defined lexicon file. + + NewStyleFormat (bool): split sentences by newline characters ("\\n") rather than punctuations. + ShowCategory (bool): show part-of-speech tags. + + Return: + tuple: the INI text (str) and the dict of options that were not consumed. 
+ """ + if data2dir is None: + data2dir = __os.getenv('CKIPWS_DATA2') + if not data2dir: + print('Warning: $CKIPWS_DATA2 is unset or null') + + cfg = [] + + cfg.append(';PyCkip {version}'.format(version=__version__)) + cfg.append(';ws.ini') + cfg.append(';Auto-generated {date}'.format(date=__datetime.datetime.now())) + cfg.append('') + + cfg.append('[ConsoleLogger]') + cfg.append('Name=ConsoleLogger') + cfg.append('') + + if lexfile: + cfg.append('[CTextLexicon]') + cfg.append('Name=TextLex') + cfg.append('FileName={lexfile}'.format(lexfile=lexfile)) + cfg.append('') + + cfg.append('[CLexicon]') + cfg.append('Name=Lex') + cfg.append('FileName={data2dir}/Lexicon.Dat'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CALexicon]') + cfg.append('Name=CALex') + cfg.append('FileName={data2dir}/CALexicon.Dat'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CDMMergedParser]') + cfg.append('Name=DMMergedParser') + cfg.append('GenerateMaxLengthWordOnly=no') + cfg.append('LexiconName=Lex') + cfg.append('') + + cfg.append('[CDMSplittedParser]') + cfg.append('Name=DMSplittedParser') + cfg.append('GenerateMaxLengthWordOnly=no') + cfg.append('LexiconName=Lex') + cfg.append('') + + cfg.append('[CHTRDRule3]') + cfg.append('Name=RD3') + cfg.append('LexiconName=Lex') + cfg.append('') + + cfg.append('[CHTRDRule6]') + cfg.append('Name=RD6') + cfg.append('LexiconName=Lex') + cfg.append('') + + cfg.append('[CHTRDRule7]') + cfg.append('Name=RD7') + cfg.append('LexiconName=Lex') + cfg.append('') + + cfg.append('[CHTForeignWord]') + cfg.append('Name=FW') + cfg.append('LexiconName=Lex') + cfg.append('') + + cfg.append('[CHTBoundWord]') + cfg.append('Name=BW') + cfg.append('LexiconName=Lex') + cfg.append('') + + cfg.append('[CMaxMatch]') + cfg.append('Name=MaxMatch') + cfg.append('WindowSize=3') + cfg.append('') + + cfg.append('[CHTCategoryPredictor]') + cfg.append('Name=CatPred') + 
cfg.append('PrefixCategoryFileName={data2dir}/CatPredictData/PrefixCategoryFreq'.format(data2dir=data2dir)) + cfg.append('PrefixFileName={data2dir}/CatPredictData/PrefixFreq'.format(data2dir=data2dir)) + cfg.append('SuffixCategoryFileName={data2dir}/CatPredictData/SuffixCategoryFreq'.format(data2dir=data2dir)) + cfg.append('SuffixFileName={data2dir}/CatPredictData/SuffixFreq'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CStatProb]') + cfg.append('Name=CAProb1') + cfg.append('FileName={data2dir}/CAStat-w(0)c(0)-w(-1).dat'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CStatProb]') + cfg.append('Name=CAProb2') + cfg.append('FileName={data2dir}/CAStat-w(0)c(0)-w(1).dat'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CStatProb]') + cfg.append('Name=CAProb3') + cfg.append('FileName={data2dir}/CAStat-w(0)c(0)-w(-2).dat'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CStatProb]') + cfg.append('Name=CAProb4') + cfg.append('FileName={data2dir}/CAStat-w(0)c(0)-w(2).dat'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CoveringAmbiguity]') + cfg.append('Name=CA') + cfg.append('LexiconName=Lex') + cfg.append('CoveringAmbiguityLexiconName=CALex') + cfg.append('InsertSplittedWordsOnly=false') + cfg.append('StatisticProbability1=CAProb1') + cfg.append('StatisticProbability2=CAProb2') + cfg.append('StatisticProbability3=CAProb3') + cfg.append('StatisticProbability4=CAProb4') + cfg.append('') + + cfg.append('[CStatProb]') + cfg.append('Name=Prob1') + cfg.append('FileName={data2dir}/CKIPWStatistic-w(-1)-w(0).dat'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CStatProb]') + cfg.append('Name=Prob2') + cfg.append('FileName={data2dir}/CKIPWStatistic-c(-1)-c(0).dat'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CStatProb]') + cfg.append('Name=Prob3') + cfg.append('FileName={data2dir}/CKIPWStatistic-c(0)-w(0).dat'.format(data2dir=data2dir)) + cfg.append('') + + 
cfg.append('[CSimpleProbModel]') + cfg.append('Name=ProbModel') + cfg.append('StatisticProbability1=Prob1') + cfg.append('StatisticProbability2=Prob2') + cfg.append('StatisticProbability3=Prob3') + cfg.append('LexiconName=Lex') + if lexfile: cfg.append('TextLexiconName=TextLex') # must go into the INI (not stdout) so this section references the user lexicon + cfg.append('AdjustProb3=true') + cfg.append('CoveringAmbiguityLexiconName=CALex') + cfg.append('CategoryPredictor=CatPred') + cfg.append('KeepBestCategory=true') + cfg.append('SimplifiedCategory=false') + cfg.append('') + + cfg.append('[CDetectMonosyllabicMorpheme]') + cfg.append('Name=DMM') + cfg.append('ApplyDefaultHeuristicDetectRule=yes') + cfg.append('InitDataPath={data2dir}/uwea/qrulepool/'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[ChineseName]') + cfg.append('Name=CN') + cfg.append('LexiconName=Lex') + cfg.append('InitDataPath={data2dir}/uwea/data/'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CForeignName]') + cfg.append('Name=FN') + cfg.append('LexiconName=Lex') + if lexfile: cfg.append('TextLexiconName=TextLex') + cfg.append('InitDataPath={data2dir}/uwea/data/'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CompoundWord]') + cfg.append('Name=CW') + cfg.append('LexiconName=Lex') + cfg.append('InitDataPath={data2dir}/uwea/data/'.format(data2dir=data2dir)) + cfg.append('') + + cfg.append('[CStatisticWord]') + cfg.append('Name=SW') + cfg.append('LexiconName=Lex') + if lexfile: cfg.append('TextLexiconName=TextLex') + cfg.append('CategoryPredictor=CatPred') + cfg.append('InitDataPath={data2dir}/uwea/data/'.format(data2dir=data2dir)) + cfg.append('ApplyRule=639') + cfg.append('') + + cfg.append('[CAffixCombiner]') + cfg.append('Name=AC') + cfg.append('LexiconName=Lex') + if lexfile: cfg.append('TextLexiconName=TextLex') + cfg.append('CategoryPredictor=CatPred') + cfg.append('') + + cfg.append('[CSimilarStructureCombiner]') + cfg.append('Name=SSC') + cfg.append('AutoCombineWordLen=2') + cfg.append('HeuristicCombinedWordMaxLen=3') + 
cfg.append('LexiconName=Lex') + cfg.append('CategoryPredictor=CatPred') + cfg.append('') + + cfg.append('[COnlineLexicon]') + cfg.append('Name=OnlineLexForUWGen') + cfg.append('') + + cfg.append('[CUnknownWord]') + cfg.append('Name=UW') + cfg.append('UnknownWordGeneratorList=DMM CN FN CW SW SSC') + cfg.append('OnlineLexicon=OnlineLexForUWGen') + cfg.append('') + + if lexfile: + cfg.append('[CLexWordGenerator]') + cfg.append('Name=myLWGen') + cfg.append('LexiconName=TextLex') + cfg.append('') + + cfg.append('[CLexWordGenerator]') + cfg.append('Name=LWGen') + cfg.append('LexiconName=Lex') + cfg.append('') + + cfg.append('[CLexWordGenerator]') + cfg.append('Name=LWGen1') + cfg.append('LexiconName=Lex') + cfg.append('MaxWordLen=1') + cfg.append('') + + cfg.append('[CLexWordGenerator]') + cfg.append('Name=UWGen') + cfg.append('LexiconName=OnlineLexForUWGen') + cfg.append('') + + cfg.append('[CSimpleProbModelResult]') + cfg.append('Name=ProbModelResult') + cfg.append('ProbabilityModelName=ProbModel') + cfg.append('NewStyleFormat={NewStyleFormat}'.format(NewStyleFormat=str(NewStyleFormat).lower())) + cfg.append('ShowCategory={ShowCategory}'.format(ShowCategory=str(ShowCategory).lower())) + cfg.append('LexiconName=Lex') + cfg.append('CategoryPredictor=CatPred') + cfg.append('KeepExistingWord=true') + cfg.append('FeatureAssigner=FA') + cfg.append('FilterBadWord=false') + cfg.append('') + + cfg.append('[CDetectDMForPostProcess]') + cfg.append('Name=DDMFPP') + cfg.append('') + + cfg.append('[CRemoveWordToBePostProcessed]') + cfg.append('Name=RWTBPP') + cfg.append('') + + HandlerList = ['LWGen', 'myLWGen', 'DMMergedParser', 'RD3', 'RD6', 'RD7', 'FW', 'BW', 'MaxMatch', 'ProbModel', 'UW', 'DDMFPP', 'LWGen', 'UWGen', 'RWTBPP', 'LWGen', 'myLWGen', 'DMSplittedParser', 'BW', 'MaxMatch', 'ProbModel', 'CA'] + if not lexfile: + while 'myLWGen' in HandlerList: + HandlerList.remove('myLWGen') + + cfg.append('[CWordSegmentor]') + cfg.append('Name=MainWS') + 
cfg.append('ArticleMaxLineNum=300') + cfg.append('SentenceMaxWordNum=80') + cfg.append('ReloadMyDic=false') + cfg.append('SentenceDelimiter=,,;。!?') + cfg.append('HandlerList={HandlerList}'.format(HandlerList=' '.join(HandlerList))) + cfg.append('Result=ProbModelResult') + cfg.append('') + + return '\n'.join(cfg), options diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..11e9ec4 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.rst \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..6eda426 --- /dev/null +++ b/setup.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +__author__ = 'Mu Yang ' +__copyright__ = 'Copyright 2018-2019' + +from setuptools import dist +dist.Distribution().fetch_build_eggs([ + 'Cython>=0.29', +]) + +import sys +from setuptools import setup, find_packages +from setuptools.extension import Extension +from setuptools.command.install import install +from Cython.Build import cythonize + +import pyximport; pyximport.install() +import about + +with open('README.rst') as fin: + readme = fin.read() + +class InstallCommand(install): + + user_options = install.user_options + [ + ('no-ws', None, 'without CKIPWS'), + ('no-parser', None, 'without CKIP-Parser'), + ] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__extensions = [] + + def initialize_options(self): + super().initialize_options() + self.no_ws = False + self.no_parser = False + + def finalize_options(self): + super().finalize_options() + + if self.no_ws: + print('Disable CKIPWS support!') + i = next((i for i, ext_module in enumerate(self.distribution.ext_modules) if ext_module.name == 'ckipws'), None) + if i is not None: del self.distribution.ext_modules[i] + + if self.no_parser: + print('Disable CKIP-Parser support!') + i = next((i for i, ext_module in enumerate(self.distribution.ext_modules) if ext_module.name == 'ckipparser'), None) + if 
i is not None: del self.distribution.ext_modules[i] + +setup( + name=about.__title__, + version=about.__version__, + author=about.__author_name__, + author_email=about.__author_email__, + maintainer=about.__author_name__, + maintainer_email=about.__author_email__, + description=about.__description__, + long_description=readme, + long_description_content_type='text/x-rst', + url=about.__url__, + download_url=about.__download_url__, + platforms=['linux_x86_64'], + license=about.__license__, + classifiers=[ + 'Development Status :: 4 - Beta', + 'Environment :: Console', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Cython', + 'License :: OSI Approved :: MIT License', + 'Operating System :: POSIX :: Linux', + 'Natural Language :: Chinese (Traditional)', + ], + ext_modules=cythonize( + [ + Extension('ckipws', + sources=['ckipws/ckipws.pyx'], + libraries=['WordSeg'], + ), + Extension('ckipparser', + sources=['ckipparser/ckipparser.pyx'], + libraries=['CKIPCoreNLP', 'CKIPParser', 'CKIPSRL', 'CKIPWS'], + ), + ], + build_dir='build', + ), + cmdclass={ + 'install': InstallCommand, + }, +)