diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..76e1142
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,11 @@
+version: 2
+updates:
+ -
+ package-ecosystem: composer
+ directory: "/"
+ schedule:
+ interval: weekly
+ versioning-strategy: auto
+ groups:
+ dev-dependencies:
+ dependency-type: "development"
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..0fa594f
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,81 @@
+# GitHub Actions Workflow generated with Ghygen
+# Original configuration: https://ghygen.hi-folks.dev?code=0555902844da5dd5163a69e93327a0aa
+name: PHP NLP Tools
+on:
+ push:
+ branches:
+ - master
+ - main
+ - develop
+ pull_request:
+ branches:
+ - master
+ - main
+ - develop
+
+jobs:
+ tests:
+ runs-on: ${{ matrix.operating-system }}
+
+ strategy:
+ matrix:
+ operating-system: [ ubuntu-latest ]
+ php: [ '8.1', '8.2', '8.3' ]
+ dependency-stability: [ 'prefer-stable' ]
+
+ name: PHP ${{ matrix.php }} - ${{ matrix.dependency-stability }} - ${{ matrix.operating-system }}
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Install PHP versions
+ uses: shivammathur/setup-php@v2
+ with:
+ php-version: ${{ matrix.php }}
+ extensions: gd
+
+ - name: Get Composer Cache Directory
+ id: composer-cache
+ run: |
+ echo "dir=$(composer config cache-files-dir)" >> $GITHUB_OUTPUT
+
+ - name: Cache Composer dependencies
+ uses: actions/cache@v4
+ id: actions-cache
+ with:
+ path: ${{ steps.composer-cache.outputs.dir }}
+ key: ${{ runner.os }}-composer-${{ hashFiles('**/composer.lock') }}
+ restore-keys: |
+ ${{ runner.os }}-composer-
+
+ - name: Cache PHP dependencies (vendor)
+ uses: actions/cache@v4
+ id: vendor-cache
+ with:
+ path: vendor
+ key: ${{ runner.os }}-build-${{ hashFiles('**/composer.lock') }}
+
+ - name: Install Dependencies
+ if: steps.vendor-cache.outputs.cache-hit != 'true'
+ run: |
+ composer update --${{ matrix.dependency-stability }} --prefer-dist --no-interaction
+
+ - name: Update Dependencies with latest stable
+ if: matrix.dependency-stability == 'prefer-stable'
+ run: composer update --prefer-stable
+
+ # Code quality
+ - name: Execute Code Sniffer
+ run: vendor/bin/phpcs
+
+ - name: Execute PHPStan
+ run: vendor/bin/phpstan analyse
+
+ - name: Execute Rector
+ run: vendor/bin/rector --dry-run
+
+ - name: Execute PHPUnit
+ run: vendor/bin/phpunit --exclude-group Slow
+
+
diff --git a/.gitignore b/.gitignore
index 0431448..eccccdf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
-vendor/
-/nbproject/private/
-nbproject
+/vendor/
+/composer.lock
+/.phpunit.result.cache
+
diff --git a/README.markdown b/README.markdown
index c4f0ce4..5440521 100644
--- a/README.markdown
+++ b/README.markdown
@@ -1,7 +1,7 @@
[PHP NlpTools](http://php-nlp-tools.com/)
=============
-NlpTools is a set of php 5.3+ classes for beginner to
+NlpTools is a set of php 8.1+ classes for beginner to
semi advanced natural language processing work.
Documentation
@@ -92,3 +92,29 @@ Lda is still experimental and quite slow but it works. [See an example](http://p
2. Stop words
3. Language based normalizers
4. Classifier based transformation for creating flexible preprocessing pipelines
+
+Testing information
+===================
+
+Writing Tests
+-------------
+
+* Test classes should be in the same namespace as the class that is being tested
+* Any data needed for the test or produced by the test should be in the 'data' directory
+ under the folder that corresponds to the namespace. Only data that is needed (not produced) is committed to
+ the repository.
+* Tests should be marked with the groups **Slow** and **VerySlow** if they require more than
+ 10 seconds and 1 minute respectively. If a test is marked as VerySlow it should also be marked
+ as Slow.
+* Both functional and unit tests are welcome; a minimal sketch following these conventions is shown below.
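+
+For example (the class under test and the assertion values are illustrative):
+
+```php
+<?php
+namespace NlpTools\Analysis;
+
+use PHPUnit\Framework\TestCase;
+
+class FreqDistTest extends TestCase
+{
+    // add #[\PHPUnit\Framework\Attributes\Group('Slow')] (or @group Slow)
+    // only if the test needs more than 10 seconds
+    public function testHapaxes(): void
+    {
+        $freqDist = new FreqDist(["a", "b", "b"]);
+        $this->assertSame(["a"], $freqDist->getHapaxes());
+    }
+}
+```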
+
+Executing Tests
+---------------
+
+Currently only one test suite is defined (all tests). Because some tests take a long time to
+run, you can use `phpunit --exclude-group Slow` or `phpunit --exclude-group VerySlow`
+to skip the slower tests.
+
+PHPUnit should either be run from inside the tests folder or be given the phpunit.xml
+file as its configuration.
diff --git a/composer.json b/composer.json
index 40dcb9d..9f0cc28 100644
--- a/composer.json
+++ b/composer.json
@@ -1,25 +1,27 @@
{
- "name": "nlp-tools/nlp-tools",
- "description": "NlpTools is a set of php 5.3+ classes for beginner to semi advanced natural language processing work.",
- "keywords": ["nlp","machine learning"],
- "license": "WTFPL",
- "authors": [
- {
- "name": "Angelos Katharopoulos",
- "email": "angelos@yourse.gr"
- }
- ],
- "require": {
- "php": ">=5.3"
- },
- "autoload": {
- "psr-0": {
- "NlpTools\\": "src/"
- }
- },
- "extra": {
- "branch-alias": {
- "dev-master": "1.0.x-dev"
- }
- }
+ "name": "nlp-tools/nlp-tools",
+ "description": "NlpTools is a set of php 8.1+ classes for beginner to semi advanced natural language processing work.",
+ "keywords": ["nlp","machine learning"],
+ "license": "WTFPL",
+ "authors": [
+ {
+ "name": "Angelos Katharopoulos",
+ "email": "angelos@yourse.gr"
+ }
+ ],
+ "require": {
+ "php": ">=8.1",
+ "ext-gd": "*"
+ },
+ "require-dev": {
+ "squizlabs/php_codesniffer": "^3.10",
+ "phpstan/phpstan": "^1.10",
+ "phpunit/phpunit": "^10.0 || ^11.0",
+ "rector/rector": "^1.0"
+ },
+ "autoload": {
+ "psr-0": {
+ "NlpTools\\": "src/"
+ }
+ }
}
diff --git a/phpcs.xml b/phpcs.xml
new file mode 100644
index 0000000..c1b0851
--- /dev/null
+++ b/phpcs.xml
@@ -0,0 +1,24 @@
+
+
+ The coding standard.
+
+
+
+ src
+ tests
+ */tests/sentiment_maxent.php
+
+
+
+
+
+
+
+
+ error
+
+
+
+
+
+
\ No newline at end of file
diff --git a/phpstan.neon b/phpstan.neon
new file mode 100644
index 0000000..3e9d63d
--- /dev/null
+++ b/phpstan.neon
@@ -0,0 +1,8 @@
+parameters:
+ paths:
+ - ./src
+ - ./tests
+ excludePaths:
+ - ./tests/sentiment_maxent.php
+ # Level 9 is the highest (it adds checks for the mixed type)
+ level: 6
\ No newline at end of file
diff --git a/phpunit.xml b/phpunit.xml
new file mode 100644
index 0000000..b21bde5
--- /dev/null
+++ b/phpunit.xml
@@ -0,0 +1,5 @@
+
+
+ ./tests/NlpTools/
+
+
diff --git a/rector.php b/rector.php
new file mode 100644
index 0000000..f70b62e
--- /dev/null
+++ b/rector.php
@@ -0,0 +1,51 @@
+withPaths([
+ __DIR__.'/src',
+ __DIR__.'/tests',
+ ])
+ // apply the PHP upgrade sets up to the project's PHP version
+ ->withPhpSets()
+ ->withRules([
+ AddVoidReturnTypeWhereNoReturnRector::class,
+ ChangeConstantVisibilityRector::class,
+ RenameForeachValueVariableToMatchExprVariableRector::class,
+ ReturnTypeFromReturnNewRector::class,
+ CountArrayToEmptyArrayComparisonRector::class,
+ StrictArraySearchRector::class,
+ SymplifyQuoteEscapeRector::class,
+ DeclareStrictTypesRector::class,
+ ])
+ ->withSets([
+ PHPUnitSetList::PHPUNIT_110,
+ ])
+ ->withPHPStanConfigs(['phpstan.neon'])
+ ->withPreparedSets(
+ deadCode: true,
+ codeQuality: true,
+ codingStyle: true,
+ typeDeclarations: true,
+ privatization: true,
+ naming: true,
+ instanceOf: true,
+ earlyReturn: true,
+ strictBooleans: true
+ )
+ ->withSkip([
+ __DIR__ . '/tests/sentiment_maxent.php'
+ ]);
diff --git a/src/NlpTools/Analysis/FreqDist.php b/src/NlpTools/Analysis/FreqDist.php
index 9e479e5..a2a93b4 100644
--- a/src/NlpTools/Analysis/FreqDist.php
+++ b/src/NlpTools/Analysis/FreqDist.php
@@ -1,4 +1,7 @@
*/
- protected $keyValues = array();
+ protected array $keyValues = [];
/**
* The total number of tokens originally passed into FreqDist
- * @var int
*/
- protected $totalTokens = null;
+ protected int $totalTokens;
/**
* This sorts the token meta data collection right away so use
* frequency distribution data can be extracted.
- * @param array $tokens
+ *
+ * @param array $tokens
*/
public function __construct(array $tokens)
{
@@ -35,113 +38,110 @@ public function __construct(array $tokens)
/**
* Get the total number of tokens in this tokensDocument
- * @return int
*/
- public function getTotalTokens()
+ public function getTotalTokens(): int
{
return $this->totalTokens;
}
/**
* Internal function for summarizing all the data into a key value store
- * @param array $tokens The set of tokens passed into the constructor
+ *
+ * @param array $tokens
*/
- protected function preCompute(array &$tokens)
+ protected function preCompute(array &$tokens): void
{
- //count all the tokens up and put them in a key value store
+ // count all the tokens up and put them in a key value store
$this->keyValues = array_count_values($tokens);
arsort($this->keyValues);
}
/**
* Return the weight of a single token
- * @return float
*/
- public function getWeightPerToken()
+ public function getWeightPerToken(): float
{
return 1 / $this->getTotalTokens();
}
/**
* Return get the total number of unique tokens
- * @return int
*/
- public function getTotalUniqueTokens()
+ public function getTotalUniqueTokens(): int
{
return count($this->keyValues);
}
/**
* Return the sorted keys by frequency desc
- * @return array
+ *
+ * @return array
*/
- public function getKeys()
+ public function getKeys(): array
{
return array_keys($this->keyValues);
}
/**
* Return the sorted values by frequency desc
- * @return array
+ *
+ * @return array
*/
- public function getValues()
+ public function getValues(): array
{
return array_values($this->keyValues);
}
/**
* Return the full key value store
- * @return array
+ *
+ * @return array
*/
- public function getKeyValues()
+ public function getKeyValues(): array
{
return $this->keyValues;
}
/**
* Return a token's count
- * @param string $string
- * @return mixed
*/
- public function getTotalByToken($string)
+ public function getTotalByToken(string $string): float|false
{
$array = $this->keyValues;
- if(array_key_exists($string, $array)) {
+ if (array_key_exists($string, $array)) {
return $array[$string];
- } else {
- return false;
}
+
+ return false;
}
/**
* Return a token's weight (for user's own tf-idf/pdf/iduf implem)
- * @param string $string
- * @return mixed
*/
- public function getTokenWeight($string)
+ public function getTokenWeight(string $string): float|false
{
- if($this->getTotalByToken($string)){
- return $this->getTotalByToken($string)/$this->getTotalTokens();
- } else {
- return false;
+ if ($this->getTotalByToken($string)) {
+ return $this->getTotalByToken($string) / $this->getTotalTokens();
}
+
+ return false;
}
/**
- *
* Returns an array of tokens that occurred once
* @todo This is an inefficient approach
- * @return array
+ *
+ * @return array
*/
- public function getHapaxes()
+ public function getHapaxes(): array
{
- $samples = array();
+ $samples = [];
foreach ($this->getKeyValues() as $sample => $count) {
- if ($count == 1) {
+ if ((int) $count === 1) {
$samples[] = $sample;
}
}
+
return $samples;
}
-
}
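
To make the refactored FreqDist API above concrete, a small usage sketch (the token list and
the resulting counts are illustrative only):

```php
<?php
use NlpTools\Analysis\FreqDist;

$freqDist = new FreqDist(["time", "flies", "like", "an", "arrow", "time"]);

$freqDist->getTotalTokens();       // 6
$freqDist->getTotalUniqueTokens(); // 5
$freqDist->getKeyValues();         // ["time" => 2, "flies" => 1, "like" => 1, ...]
$freqDist->getHapaxes();           // ["flies", "like", "an", "arrow"]
$freqDist->getTokenWeight("time"); // 2 / 6
```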
diff --git a/src/NlpTools/Analysis/Idf.php b/src/NlpTools/Analysis/Idf.php
index 785e170..440a8c8 100644
--- a/src/NlpTools/Analysis/Idf.php
+++ b/src/NlpTools/Analysis/Idf.php
@@ -1,5 +1,7 @@
*/
class Idf implements \ArrayAccess
{
- protected $logD;
- protected $idf;
+ protected float $logD;
+
+ /**
+ * @var array
+ */
+ protected array $idf;
/**
- * @param TrainingSet $tset The set of documents for which we will compute the idf
- * @param FeatureFactoryInterface $ff A feature factory to translate the document data to single tokens
+ * @param TrainingSet $trainingSet The set of documents for which we will compute the idf
+ * @param FeatureFactoryInterface $featureFactory A feature factory to translate the document data to single tokens
*/
- public function __construct(TrainingSet $tset, FeatureFactoryInterface $ff=null)
+ public function __construct(TrainingSet $trainingSet, ?FeatureFactoryInterface $featureFactory = null)
{
- if ($ff===null)
- $ff = new DataAsFeatures();
+ if (!$featureFactory instanceof FeatureFactoryInterface) {
+ $featureFactory = new DataAsFeatures();
+ }
- $tset->setAsKey(TrainingSet::CLASS_AS_KEY);
- foreach ($tset as $class=>$doc) {
- $tokens = $ff->getFeatureArray($class,$doc); // extract tokens from the document
- $tokens = array_fill_keys($tokens,1); // make them occur once
- foreach ($tokens as $token=>$v) {
- if (isset($this->idf[$token]))
+ $trainingSet->setAsKey(TrainingSet::CLASS_AS_KEY);
+ foreach ($trainingSet as $class => $doc) {
+ $tokens = $featureFactory->getFeatureArray($class, $doc); // extract tokens from the document
+ $tokens = array_fill_keys($tokens, 1); // make them occur once
+ foreach (array_keys($tokens) as $token) {
+ if (isset($this->idf[$token])) {
$this->idf[$token]++;
- else
+ } else {
$this->idf[$token] = 1;
+ }
}
}
// this idf so far contains the doc frequency
// we will now inverse it and take the log
- $D = count($tset);
+ $D = count($trainingSet);
foreach ($this->idf as &$v) {
- $v = log($D/$v);
+ $v = log($D / $v);
}
+
$this->logD = log($D);
}
@@ -54,27 +65,17 @@ public function __construct(TrainingSet $tset, FeatureFactoryInterface $ff=null)
* Implements the array access interface. Return the computed idf or
* the logarithm of the count of the documents for a token we have not
* seen before.
- *
- * @param string $token The token to return the idf for
- * @return float The idf
*/
- public function offsetGet($token)
+ public function offsetGet(mixed $token): mixed
{
- if (isset($this->idf[$token])) {
- return $this->idf[$token];
- } else {
- return $this->logD;
- }
+ return $this->idf[$token] ?? $this->logD;
}
/**
* Implements the array access interface. Return true if the token exists
* in the corpus.
- *
- * @param string $token The token to check if it exists in the corpus
- * @return bool
*/
- public function offsetExists($token)
+ public function offsetExists(mixed $token): bool
{
return isset($this->idf[$token]);
}
@@ -83,7 +84,7 @@ public function offsetExists($token)
* Will not be implemented. Throws \BadMethodCallException because
* one should not be able to alter the idf values directly.
*/
- public function offsetSet($token, $value)
+ public function offsetSet(mixed $offset, mixed $value): void
{
throw new \BadMethodCallException("The idf of a specific token cannot be set explicitly");
}
@@ -92,7 +93,7 @@ public function offsetSet($token, $value)
* Will not be implemented. Throws \BadMethodCallException because
* one should not be able to alter the idf values directly.
*/
- public function offsetUnset($token)
+ public function offsetUnset(mixed $offset): void
{
throw new \BadMethodCallException("The idf of a specific token cannot be unset");
}
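
As a quick illustration of the Idf refactor above (a hedged sketch: it assumes the default
DataAsFeatures feature factory and uses empty class labels, which the API allows):

```php
<?php
use NlpTools\Analysis\Idf;
use NlpTools\Documents\TokensDocument;
use NlpTools\Documents\TrainingSet;

$trainingSet = new TrainingSet();
$trainingSet->addDocument("", new TokensDocument(["a", "b", "c"]));
$trainingSet->addDocument("", new TokensDocument(["a", "b"]));
$trainingSet->addDocument("", new TokensDocument(["a"]));

$idf = new Idf($trainingSet);

$idf["a"];      // log(3/3) = 0, "a" appears in every document
$idf["c"];      // log(3/1) ≈ 1.10
$idf["unseen"]; // falls back to log(3) for a token never seen before
```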
diff --git a/src/NlpTools/Classifiers/ClassifierInterface.php b/src/NlpTools/Classifiers/ClassifierInterface.php
index 566cf26..2acfff3 100644
--- a/src/NlpTools/Classifiers/ClassifierInterface.php
+++ b/src/NlpTools/Classifiers/ClassifierInterface.php
@@ -1,15 +1,17 @@
$classes
*/
- public function classify(array $classes, \NlpTools\Documents\DocumentInterface $d);
+ public function classify(array $classes, DocumentInterface $document): string;
}
diff --git a/src/NlpTools/Classifiers/FeatureBasedLinearClassifier.php b/src/NlpTools/Classifiers/FeatureBasedLinearClassifier.php
index 206ede3..e2cd8c7 100644
--- a/src/NlpTools/Classifiers/FeatureBasedLinearClassifier.php
+++ b/src/NlpTools/Classifiers/FeatureBasedLinearClassifier.php
@@ -1,10 +1,12 @@
feature_factory = $ff;
- $this->model = $m;
}
/**
* Compute the vote for every class. Return the class that
* receive the maximum vote.
*
- * @param array $classes A set of classes
- * @param DocumentInterface $d A Document
- * @return string A class
+ * @param array $classes
*/
- public function classify(array $classes, DocumentInterface $d)
+ public function classify(array $classes, DocumentInterface $document): string
{
$maxclass = current($classes);
- $maxvote = $this->getVote($maxclass,$d);
+ $maxvote = $this->getVote($maxclass, $document);
while ($class = next($classes)) {
- $v = $this->getVote($class,$d);
- if ($v>$maxvote) {
+ $v = $this->getVote($class, $document);
+ if ($v > $maxvote) {
$maxclass = $class;
$maxvote = $v;
}
@@ -49,17 +42,13 @@ public function classify(array $classes, DocumentInterface $d)
/**
* Compute the features that fire for the Document $d. The sum of
* the weights of the features is the vote.
- *
- * @param string $class The vote for class $class
- * @param DocumentInterface $d The vote for Document $d
- * @return float The vote of the model for class $class and Document $d
*/
- public function getVote($class, DocumentInterface $d)
+ public function getVote(string $class, DocumentInterface $document): float
{
$v = 0;
- $features = $this->feature_factory->getFeatureArray($class,$d);
- foreach ($features as $f) {
- $v += $this->model->getWeight($f);
+ $features = $this->featureFactory->getFeatureArray($class, $document);
+ foreach ($features as $feature) {
+ $v += $this->linearModel->getWeight($feature);
}
return $v;
diff --git a/src/NlpTools/Classifiers/MultinomialNBClassifier.php b/src/NlpTools/Classifiers/MultinomialNBClassifier.php
index 7bdcab5..0679c81 100644
--- a/src/NlpTools/Classifiers/MultinomialNBClassifier.php
+++ b/src/NlpTools/Classifiers/MultinomialNBClassifier.php
@@ -1,5 +1,7 @@
feature_factory = $ff;
- $this->model = $m;
}
/**
@@ -27,17 +22,15 @@ public function __construct(FeatureFactoryInterface $ff, MultinomialNBModelInter
* successively and return that class that has the maximum
* probability.
*
- * @param array $classes The classes from which to choose
- * @param DocumentInterface $d The document to classify
- * @return string $class The class that has the maximum probability
+ * @param array $classes
*/
- public function classify(array $classes, DocumentInterface $d)
+ public function classify(array $classes, DocumentInterface $document): string
{
$maxclass = current($classes);
- $maxscore = $this->getScore($maxclass,$d);
- while ($class=next($classes)) {
- $score = $this->getScore($class,$d);
- if ($score>$maxscore) {
+ $maxscore = $this->getScore($maxclass, $document);
+ while ($class = next($classes)) {
+ $score = $this->getScore($class, $document);
+ if ($score > $maxscore) {
$maxclass = $class;
$maxscore = $score;
}
@@ -53,22 +46,19 @@ public function classify(array $classes, DocumentInterface $d)
*
* @todo perhaps MultinomialNBModel should have precomputed the logs
* ex.: getLogPrior() and getLogCondProb()
- *
- * @param string $class The class for which we are getting a score
- * @param DocumentInterface The document whose score we are getting
- * @return float The log of the probability of $d belonging to $class
*/
- public function getScore($class, DocumentInterface $d)
+ public function getScore(string $class, DocumentInterface $document): float
{
- $score = log($this->model->getPrior($class));
- $features = $this->feature_factory->getFeatureArray($class,$d);
- if (is_int(key($features)))
+ $score = log($this->multinomialNBModel->getPrior($class));
+ $features = $this->featureFactory->getFeatureArray($class, $document);
+ if (is_int(key($features))) {
$features = array_count_values($features);
- foreach ($features as $f=>$fcnt) {
- $score += $fcnt*log($this->model->getCondProb($f,$class));
+ }
+
+ foreach ($features as $f => $fcnt) {
+ $score += $fcnt * log($this->multinomialNBModel->getCondProb($f, $class));
}
return $score;
}
-
}
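
The classifier above is meant to be paired with the FeatureBasedNB model refactored later in
this diff. A hedged end-to-end sketch (the documents and class labels are made up; the method
signatures follow this diff):

```php
<?php
use NlpTools\Classifiers\MultinomialNBClassifier;
use NlpTools\Documents\TokensDocument;
use NlpTools\Documents\TrainingSet;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Models\FeatureBasedNB;

$trainingSet = new TrainingSet();
$trainingSet->addDocument("spam", new TokensDocument(["buy", "cheap", "pills", "now"]));
$trainingSet->addDocument("ham", new TokensDocument(["meeting", "moved", "to", "noon"]));

$featureFactory = new DataAsFeatures();
$model = new FeatureBasedNB();
$model->train($featureFactory, $trainingSet);

$classifier = new MultinomialNBClassifier($featureFactory, $model);

// getScore() adds log(prior) to the sum of count(feature) * log(condprob) per feature
$classifier->classify(["spam", "ham"], new TokensDocument(["cheap", "pills"])); // "spam"
```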
diff --git a/src/NlpTools/Clustering/CentroidFactories/CentroidFactoryInterface.php b/src/NlpTools/Clustering/CentroidFactories/CentroidFactoryInterface.php
index 3794b5b..c90cfa9 100644
--- a/src/NlpTools/Clustering/CentroidFactories/CentroidFactoryInterface.php
+++ b/src/NlpTools/Clustering/CentroidFactories/CentroidFactoryInterface.php
@@ -1,5 +1,7 @@
$docs The docs from which the centroid will be computed
+ * @param array $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
* @return mixed The centroid. It could be any form of data a number, a vector (it will be the same as the data provided in docs)
*/
- public function getCentroid(array &$docs, array $choose=array());
+ public function getCentroid(array &$docs, array $choose = []): mixed;
}
diff --git a/src/NlpTools/Clustering/CentroidFactories/Euclidean.php b/src/NlpTools/Clustering/CentroidFactories/Euclidean.php
index b0a17bb..565fa2e 100644
--- a/src/NlpTools/Clustering/CentroidFactories/Euclidean.php
+++ b/src/NlpTools/Clustering/CentroidFactories/Euclidean.php
@@ -1,52 +1,61 @@
$doc The doc data to transform to sparse vector
+ * @return array A sparse vector representing the document to the n-dimensional euclidean space
*/
- protected function getVector(array $doc)
+ protected function getVector(array $doc): array
{
- if (is_int(key($doc)))
+ if (is_int(key($doc))) {
return array_count_values($doc);
- else
- return $doc;
+ }
+
+ return $doc;
}
/**
* Compute the mean value for each dimension.
*
- * @param array $docs The docs from which the centroid will be computed
- * @param array $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
- * @return mixed The centroid. It could be any form of data a number, a vector (it will be the same as the data provided in docs)
+ * @param array $docs The docs from which the centroid will be computed
+ * @param array $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
+ * @return array The centroid. It could be any form of data a number, a vector (it will be the same as the data provided in docs)
*/
- public function getCentroid(array &$docs, array $choose=array())
+ public function getCentroid(array &$docs, array $choose = []): array
{
- $v = array();
- if (empty($choose))
- $choose = range(0,count($docs)-1);
+ $v = [];
+ if ($choose === []) {
+ $choose = range(0, count($docs) - 1);
+ }
+
$cnt = count($choose);
foreach ($choose as $idx) {
$doc = $this->getVector($docs[$idx]);
- foreach ($doc as $k=>$w) {
- if (!isset($v[$k]))
+ foreach ($doc as $k => $w) {
+ if (!isset($v[$k])) {
$v[$k] = $w;
- else
+ } else {
$v[$k] += $w;
+ }
}
}
+
foreach ($v as &$w) {
$w /= $cnt;
}
diff --git a/src/NlpTools/Clustering/CentroidFactories/Hamming.php b/src/NlpTools/Clustering/CentroidFactories/Hamming.php
index dbd229a..b335b03 100644
--- a/src/NlpTools/Clustering/CentroidFactories/Hamming.php
+++ b/src/NlpTools/Clustering/CentroidFactories/Hamming.php
@@ -1,5 +1,7 @@
$docs
+ * @param array $choose
*/
- public function getCentroid(array &$docs, array $choose=array())
+ public function getCentroid(array &$docs, array $choose = []): string
{
- $bitl = strlen($docs[0]);
+ $bitl = strlen((string) $docs[0]);
$buckets = array_fill_keys(
- range(0,$bitl-1),
+ range(0, $bitl - 1),
0
);
- if (empty($choose))
- $choose = range(0,count($docs)-1);
+ if ($choose === []) {
+ $choose = range(0, count($docs) - 1);
+ }
+
foreach ($choose as $idx) {
$s = $docs[$idx];
- foreach ($buckets as $i=>&$v) {
- if ($s[$i]=='1')
+ foreach ($buckets as $i => &$v) {
+ if ($s[$i] == '1') {
$v += 1;
- else
+ } else {
$v -= 1;
+ }
}
}
return implode(
'',
array_map(
- function ($v) {
- return ($v>0) ? '1' : '0';
- },
+ // @phpstan-ignore-next-line
+ fn($v): string => ($v > 0) ? '1' : '0',
$buckets
)
);
}
-
}
diff --git a/src/NlpTools/Clustering/CentroidFactories/MeanAngle.php b/src/NlpTools/Clustering/CentroidFactories/MeanAngle.php
index 98b2d70..03444c2 100644
--- a/src/NlpTools/Clustering/CentroidFactories/MeanAngle.php
+++ b/src/NlpTools/Clustering/CentroidFactories/MeanAngle.php
@@ -1,5 +1,7 @@
$v
+ * @return array
+ */
+ protected function normalize(array $v): array
{
$norm = array_reduce(
$v,
- function ($v,$w) {
- return $v+$w*$w;
- }
+ fn($v, $w): float|int => $v + $w * $w
);
$norm = sqrt($norm);
return array_map(
- function ($vi) use ($norm) {
- return $vi/$norm;
- },
+ fn($vi): float => $vi / $norm,
$v
);
}
- public function getCentroid(array &$docs, array $choose=array())
+ /**
+ * @param array $docs
+ * @param array $choose
+ * @return array
+ */
+ public function getCentroid(array &$docs, array $choose = []): array
{
- if (empty($choose))
- $choose = range(0,count($docs)-1);
+ if ($choose === []) {
+ $choose = range(0, count($docs) - 1);
+ }
+
$cnt = count($choose);
- $v = array();
+ $v = [];
foreach ($choose as $idx) {
$d = $this->normalize($this->getVector($docs[$idx]));
- foreach ($d as $i=>$vi) {
- if (!isset($v[$i]))
+ foreach ($d as $i => $vi) {
+ if (!isset($v[$i])) {
$v[$i] = $vi;
- else
+ } else {
$v[$i] += $vi;
+ }
}
}
return array_map(
- function ($vi) use ($cnt) {
- return $vi/$cnt;
- },
+ fn($vi): int|float => $vi / $cnt,
$v
);
}
diff --git a/src/NlpTools/Clustering/Clusterer.php b/src/NlpTools/Clustering/Clusterer.php
index de0500a..5594278 100644
--- a/src/NlpTools/Clustering/Clusterer.php
+++ b/src/NlpTools/Clustering/Clusterer.php
@@ -1,5 +1,7 @@
The clusters, an array containing arrays of offsets for the documents
*/
- abstract public function cluster(TrainingSet $documents, FeatureFactoryInterface $ff);
+ abstract public function cluster(TrainingSet $trainingSet, FeatureFactoryInterface $featureFactory): array;
/**
* Helper function to transform a TrainingSet to an array of feature vectors
+ *
+ * @return array
*/
- protected function getDocumentArray(TrainingSet $documents, FeatureFactoryInterface $ff)
+ protected function getDocumentArray(TrainingSet $trainingSet, FeatureFactoryInterface $featureFactory): array
{
- $docs = array();
- foreach ($documents as $d) {
- $docs[] = $ff->getFeatureArray('',$d);
+ $docs = [];
+ foreach ($trainingSet as $d) {
+ $docs[] = $featureFactory->getFeatureArray('', $d);
}
return $docs;
diff --git a/src/NlpTools/Clustering/Hierarchical.php b/src/NlpTools/Clustering/Hierarchical.php
index a254142..6d5ecd3 100644
--- a/src/NlpTools/Clustering/Hierarchical.php
+++ b/src/NlpTools/Clustering/Hierarchical.php
@@ -1,5 +1,7 @@
strategy = $ms;
- $this->dist = $d;
}
/**
@@ -27,33 +24,35 @@ public function __construct(MergeStrategyInterface $ms, DistanceInterface $d)
* While hierarchical clustering only returns one element, it still wraps it
* in an array to be consistent with the rest of the clustering methods.
*
- * @return array An array containing one element which is the resulting dendrogram
+ * @return array An array containing one element which is the resulting dendrogram
*/
- public function cluster(TrainingSet $documents, FeatureFactoryInterface $ff)
+ public function cluster(TrainingSet $trainingSet, FeatureFactoryInterface $featureFactory): array
{
// what a complete waste of memory here ...
// the same data exists in $documents, $docs and
// the only useful parts are in $this->strategy
- $docs = $this->getDocumentArray($documents, $ff);
- $this->strategy->initializeStrategy($this->dist,$docs);
+ $docs = $this->getDocumentArray($trainingSet, $featureFactory);
+ $this->mergeStrategy->initializeStrategy($this->distance, $docs);
unset($docs); // perhaps save some memory
// start with all the documents being in their
// own cluster we 'll merge later
- $clusters = range(0,count($documents)-1);
+ $clusters = range(0, count($trainingSet) - 1);
+ $i = 0;
$c = count($clusters);
- while ($c>1) {
+ while ($c > 1) {
// ask the strategy which to merge. The strategy
// will assume that we will indeed merge the returned clusters
- list($i,$j) = $this->strategy->getNextMerge();
- $clusters[$i] = array($clusters[$i],$clusters[$j]);
+ [$i, $j] = $this->mergeStrategy->getNextMerge();
+ $clusters[$i] = [$clusters[$i], $clusters[$j]];
unset($clusters[$j]);
$c--;
}
- $clusters = array($clusters[$i]);
+
+ $clusters = [$clusters[$i]];
// return the dendrogram
- return array($clusters);
+ return [$clusters];
}
/**
@@ -61,30 +60,33 @@ public function cluster(TrainingSet $documents, FeatureFactoryInterface $ff)
* number of clusters (the closest power of 2 larger than
* $NC)
*
- * @param array $tree The dendrogram to be flattened
- * @param integer $NC The number of clusters to cut to
- * @return array The flat clusters
+ * @param array $tree The dendrogram to be flattened
+ * @param integer $numberOfClusters The number of clusters to cut to
+ * @return array The flat clusters
*/
- public static function dendrogramToClusters($tree,$NC)
+ public static function dendrogramToClusters(array $tree, int $numberOfClusters): array
{
$clusters = $tree;
- while (count($clusters)<$NC) {
- $tmpc = array();
- foreach ($clusters as $subclust) {
- if (!is_array($subclust))
- $tmpc[] = $subclust;
- else {
- foreach ($subclust as $c)
+ while (count($clusters) < $numberOfClusters) {
+ $tmpc = [];
+ foreach ($clusters as $cluster) {
+ if (!is_array($cluster)) {
+ $tmpc[] = $cluster;
+ } else {
+ foreach ($cluster as $c) {
$tmpc[] = $c;
+ }
}
}
+
$clusters = $tmpc;
}
- foreach ($clusters as &$c) {
- $c = iterator_to_array(
+
+ foreach ($clusters as &$cluster) {
+ $cluster = iterator_to_array(
new \RecursiveIteratorIterator(
new \RecursiveArrayIterator(
- array($c)
+ [$cluster]
)
),
false // do not use keys
diff --git a/src/NlpTools/Clustering/KMeans.php b/src/NlpTools/Clustering/KMeans.php
index 73e94d6..2ea59b7 100644
--- a/src/NlpTools/Clustering/KMeans.php
+++ b/src/NlpTools/Clustering/KMeans.php
@@ -1,5 +1,7 @@
dist = $d;
- $this->n = $n;
- $this->cutoff = $cutoff;
- $this->centroidF = $cf;
}
/**
* Apply the feature factory to the documents and then cluster the resulting array
* using the provided distance metric and centroid factory.
*/
- public function cluster(TrainingSet $documents, FeatureFactoryInterface $ff)
+ public function cluster(TrainingSet $trainingSet, FeatureFactoryInterface $featureFactory): array
{
// transform the documents according to the FeatureFactory
- $docs = $this->getDocumentArray($documents,$ff);
+ $docs = $this->getDocumentArray($trainingSet, $featureFactory);
// choose N centroids at random
- $centroids = array();
- foreach (array_rand($docs,$this->n) as $key) {
+ $centroids = [];
+ foreach (array_rand($docs, $this->n) as $key) {
$centroids[] = $docs[$key];
}
// cache the distance and centroid factory functions for use
// with closures
- $dist = array($this->dist,'dist');
- $cf = array($this->centroidF,'getCentroid');
+ $dist = $this->distance->dist(...);
+ $cf = $this->centroidFactory->getCentroid(...);
// looooooooop
while (true) {
// compute the distance each document has from our centroids
// the array is MxN where M = count($docs) and N = count($centroids)
$distances = array_map(
- function ($doc) use (&$centroids,$dist) {
+ function ($doc) use (&$centroids, $dist): array {
return array_map(
- function ($c) use ($dist,$doc) {
+ fn($c): mixed =>
// it is passed with an array because dist expects references
// and it failed when run with phpunit.
// see http://php.net/manual/en/function.call-user-func.php
// for the solution used below
- return call_user_func_array(
+ call_user_func_array(
$dist,
- array(
- &$c,
- &$doc
- )
- );
- },
+ [&$c, &$doc]
+ ),
$centroids
);
},
@@ -88,23 +77,20 @@ function ($c) use ($dist,$doc) {
// initialize the empty clusters
$clusters = array_fill_keys(
array_keys($centroids),
- array()
+ []
);
- foreach ($distances as $idx=>$d) {
+ foreach ($distances as $idx => $d) {
// assign document idx to the closest centroid
- $clusters[array_search(min($d),$d)][] = $idx;
+ $clusters[array_search(min($d), $d, true)][] = $idx;
}
// compute the new centroids from the assigned documents
// using the centroid factory function
$new_centroids = array_map(
- function ($cluster) use (&$docs,$cf) {
+ function ($cluster) use (&$docs, $cf) {
return call_user_func_array(
$cf,
- array(
- &$docs,
- $cluster
- )
+ [&$docs, $cluster]
);
},
$clusters
@@ -118,9 +104,9 @@ function ($cluster) use (&$docs,$cf) {
);
// if the largest change is small enough we are done
- if (max($changes)<$this->cutoff) {
+ if (max($changes) < $this->cutoff) {
// return the clusters, the centroids and the distances
- return array($clusters,$centroids,$distances);
+ return [$clusters, $centroids, $distances];
}
// update the centroids and loooooop again
diff --git a/src/NlpTools/Clustering/MergeStrategies/CompleteLink.php b/src/NlpTools/Clustering/MergeStrategies/CompleteLink.php
index b0c8ce3..56bb14b 100644
--- a/src/NlpTools/Clustering/MergeStrategies/CompleteLink.php
+++ b/src/NlpTools/Clustering/MergeStrategies/CompleteLink.php
@@ -1,5 +1,7 @@
- return max($this->dm[$xi],$this->dm[$yi]);
+ return max($this->dm[$xi], $this->dm[$yi]);
}
}
diff --git a/src/NlpTools/Clustering/MergeStrategies/GroupAverage.php b/src/NlpTools/Clustering/MergeStrategies/GroupAverage.php
index 12828ba..427c839 100644
--- a/src/NlpTools/Clustering/MergeStrategies/GroupAverage.php
+++ b/src/NlpTools/Clustering/MergeStrategies/GroupAverage.php
@@ -1,5 +1,7 @@
+ */
+ protected array $clusterSize;
+
+ /**
+ * @param array $docs
+ */
+ public function initializeStrategy(DistanceInterface $distance, array &$docs): void
{
- parent::initializeStrategy($d,$docs);
+ parent::initializeStrategy($distance, $docs);
- $this->cluster_size = array_fill_keys(
- range(0,$this->L-1),
+ $this->clusterSize = array_fill_keys(
+ range(0, $this->L - 1),
1
);
}
- protected function newDistance($xi,$yi,$x,$y)
+ protected function newDistance(int $xi, int $yi, int $x, int $y): float
{
- $size_x = $this->cluster_size[$x];
- $size_y = $this->cluster_size[$y];
+ $size_x = $this->clusterSize[$x];
+ $size_y = $this->clusterSize[$y];
- return ($this->dm[$xi]*$size_x + $this->dm[$yi]*$size_y)/($size_x + $size_y);
+ return ($this->dm[$xi] * $size_x + $this->dm[$yi] * $size_y) / ($size_x + $size_y);
}
- public function getNextMerge()
+ /**
+ * @return array
+ */
+ public function getNextMerge(): array
{
$r = parent::getNextMerge();
- $this->cluster_size[$r[0]] += $this->cluster_size[$r[1]];
- unset($this->cluster_size[$r[1]]);
+ $this->clusterSize[$r[0]] += $this->clusterSize[$r[1]];
+ unset($this->clusterSize[$r[1]]);
return $r;
}
diff --git a/src/NlpTools/Clustering/MergeStrategies/HeapLinkage.php b/src/NlpTools/Clustering/MergeStrategies/HeapLinkage.php
index 6564a77..74c7c4b 100644
--- a/src/NlpTools/Clustering/MergeStrategies/HeapLinkage.php
+++ b/src/NlpTools/Clustering/MergeStrategies/HeapLinkage.php
@@ -1,5 +1,7 @@
+ */
+ protected \SplPriorityQueue $queue;
+
+ /**
+ * @var \SplFixedArray
+ */
+ protected \SplFixedArray $dm;
+
+ /**
+ * @var array
+ */
+ protected array $removed;
/**
* Calculate the distance of the merged cluster x,y with cluster i
@@ -32,23 +46,23 @@ abstract class HeapLinkage implements MergeStrategyInterface
* Ex.: for single link this function would be
* return min($this->dm[$xi],$this->dm[$yi]);
*/
- abstract protected function newDistance($xi,$yi,$x,$y);
+ abstract protected function newDistance(int $xi, int $yi, int $x, int $y): float;
/**
* Initialize the distance matrix and any other data structure needed
* to calculate the merges later.
*
- * @param DistanceInterface $d The distance metric used to calculate the distance matrix
- * @param array $docs The docs to be clustered
+ * @param DistanceInterface $distance The distance metric used to calculate the distance matrix
+ * @param array $docs The docs to be clustered
*/
- public function initializeStrategy(DistanceInterface $d, array &$docs)
+ public function initializeStrategy(DistanceInterface $distance, array &$docs): void
{
// the number of documents and the dimensions of the matrix
$this->L = count($docs);
// just to hold which document has been removed
- $this->removed = array_fill_keys(range(0, $this->L-1), false);
+ $this->removed = array_fill_keys(range(0, $this->L - 1), false);
// how many distances we must compute
- $elements = (int) ($this->L*($this->L-1))/2;
+ $elements = $this->L * ($this->L - 1) / 2;
// the containers that will hold the distances
$this->dm = new \SplFixedArray($elements);
$this->queue = new \SplPriorityQueue();
@@ -56,10 +70,10 @@ public function initializeStrategy(DistanceInterface $d, array &$docs)
// for each unique pair of documents calculate the distance and
// save it in the heap and distance matrix
- for ($x=0;$x<$this->L;$x++) {
- for ($y=$x+1;$y<$this->L;$y++) {
- $index = $this->packIndex($y,$x);
- $tmp_d = $d->dist($docs[$x],$docs[$y]);
+ for ($x = 0; $x < $this->L; $x++) {
+ for ($y = $x + 1; $y < $this->L; $y++) {
+ $index = $this->packIndex($y, $x);
+ $tmp_d = $distance->dist($docs[$x], $docs[$y]);
$this->dm[$index] = $tmp_d;
$this->queue->insert($index, -$tmp_d);
}
@@ -73,52 +87,54 @@ public function initializeStrategy(DistanceInterface $d, array &$docs)
* 3. Merge the clusters (by labeling one as removed)
* 4. Reheap
*
- * @return array The pair (x,y) to be merged
+ * @return array The pair (x,y) to be merged
*/
- public function getNextMerge()
+ public function getNextMerge(): array
{
// extract the pair with the smallest distance
$tmp = $this->queue->extract();
$index = $tmp["data"];
$d = -$tmp["priority"];
- list($y,$x) = $this->unravelIndex($index);
+ [$y, $x] = $this->unravelIndex($index);
// check if it is invalid
- while ($this->removed[$y] || $this->removed[$x] || $this->dm[$index]!=$d) {
+ while ($this->removed[$y] || $this->removed[$x] || $this->dm[$index] != $d) {
$tmp = $this->queue->extract();
$index = $tmp["data"];
$d = -$tmp["priority"];
- list($y,$x) = $this->unravelIndex($index);
+ [$y, $x] = $this->unravelIndex($index);
}
// Now that we have a valid pair to be merged
// calculate the distances of the merged cluster with any
// other cluster
- $yi = $this->packIndex($y,0);
- $xi = $this->packIndex($x,0);
+ $yi = $this->packIndex($y, 0);
+ $xi = $this->packIndex($x, 0);
- // for every cluster with index i<x
- for ($i=0;$i<$x;$i++,$yi++,$xi++) {
- $d = $this->newDistance($xi,$yi,$x,$y);
- if ($d!=$this->dm[$xi]) {
+ // for every cluster with index i<x
+ for ($i = 0; $i < $x; $i++, $yi++, $xi++) {
+ $d = $this->newDistance($xi, $yi, $x, $y);
+ if ($d != $this->dm[$xi]) {
$this->dm[$xi] = $d;
$this->queue->insert($xi, -$d);
}
}
+
- // for every cluster with index x<i<y
- for ($i=$x+1;$i<$y;$i++,$yi++) {
- $xi = $this->packIndex($i,$x);
- $d = $this->newDistance($xi,$yi,$x,$y);
- if ($d!=$this->dm[$xi]) {
+ // for every cluster with index x<i<y
+ for ($i = $x + 1; $i < $y; $i++, $yi++) {
+ $xi = $this->packIndex($i, $x);
+ $d = $this->newDistance($xi, $yi, $x, $y);
+ if ($d != $this->dm[$xi]) {
$this->dm[$xi] = $d;
$this->queue->insert($xi, -$d);
}
}
+
- // for every cluster x<y<i
- for ($i=$y+1;$i<$this->L;$i++) {
- $xi = $this->packIndex($i,$x);
- $yi = $this->packIndex($i,$y);
- $d = $this->newDistance($xi,$yi,$x,$y);
- if ($d!=$this->dm[$xi]) {
+ // for every cluster x<y<i
+ for ($i = $y + 1; $i < $this->L; $i++) {
+ $xi = $this->packIndex($i, $x);
+ $yi = $this->packIndex($i, $y);
+ $d = $this->newDistance($xi, $yi, $x, $y);
+ if ($d != $this->dm[$xi]) {
$this->dm[$xi] = $d;
$this->queue->insert($xi, -$d);
}
@@ -127,7 +143,7 @@ public function getNextMerge()
// mark y as removed
$this->removed[$y] = true;
- return array($x,$y);
+ return [$x, $y];
}
/**
@@ -138,20 +154,22 @@ public function getNextMerge()
* Note: y will always be larger than x
*
* @param integer $index The index to be unraveled
- * @return array An array containing (y,x)
+ * @return array An array containing (y,x)
*/
- protected function unravelIndex($index)
+ protected function unravelIndex(int $index): array
{
$a = 0;
- $b = $this->L-1;
+ $b = $this->L - 1;
$y = 0;
- while ($b-$a > 1) {
+ $i = 0;
+
+ while ($b - $a > 1) {
// the middle row in the interval [a,b]
- $y = (int) (($a+$b)/2);
+ $y = (int) (($a + $b) / 2);
// the candidate index aka how many points until this row
- $i = $y*($y-1)/2;
+ $i = $y * ($y - 1) / 2;
- // if we need an offset les then the wanted y will be in the offset [a,y]
+ // if we need a smaller offset, then the wanted y will be in the interval [a,y]
if ($i > $index) {
$b = $y;
} else {
@@ -159,23 +177,21 @@ protected function unravelIndex($index)
$a = $y;
}
}
+
// we have finished searching it is either a or b
$x = $index - $i;
// this means that it is b and we have a
if ($y <= $x) {
$y++;
- $x = $index - $y*($y-1)/2;
+ $x = $index - $y * ($y - 1) / 2;
} elseif ($x < 0) {
// this means that it is a and we have b
$y--;
- $x = $index - $y*($y-1)/2;
+ $x = $index - $y * ($y - 1) / 2;
}
- return array(
- (int) $y,
- (int) $x
- );
+ return [$y, (int) $x];
}
/**
@@ -190,8 +206,8 @@ protected function unravelIndex($index)
* @param integer $x The x coordinate (small)
* @return integer The offset in the lower triangle matrix containing the item (x,y)
*/
- protected function packIndex($y, $x)
+ protected function packIndex(int $y, int $x): int
{
- return $y*($y-1)/2 + $x;
+ return $y * ($y - 1) / 2 + $x;
}
}
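
For intuition about the packed lower-triangular indexing above (a worked example, not part of
the change): with L = 4 documents the pairs (y, x), y > x, map to offsets packIndex(1,0) = 0,
packIndex(2,0) = 1, packIndex(2,1) = 2, packIndex(3,0) = 3, packIndex(3,1) = 4 and
packIndex(3,2) = 5, so row y starts at y*(y-1)/2 and the matrix needs L*(L-1)/2 = 6 cells;
unravelIndex() inverts this by binary-searching for the row y and taking x as the remainder.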
diff --git a/src/NlpTools/Clustering/MergeStrategies/MergeStrategyInterface.php b/src/NlpTools/Clustering/MergeStrategies/MergeStrategyInterface.php
index 47b27f5..afd5b72 100644
--- a/src/NlpTools/Clustering/MergeStrategies/MergeStrategyInterface.php
+++ b/src/NlpTools/Clustering/MergeStrategies/MergeStrategyInterface.php
@@ -1,5 +1,7 @@
$docs
*/
- public function initializeStrategy(DistanceInterface $d, array &$docs);
+ public function initializeStrategy(DistanceInterface $distance, array &$docs): void;
/**
* Return the next two clusters for merging and assume
* they are merged (ex. update a similarity matrix)
*
- * @return array An array with two numbers which are the cluster ids
+ * @return array An array with two numbers which are the cluster ids
*/
- public function getNextMerge();
+ public function getNextMerge(): array;
}
diff --git a/src/NlpTools/Clustering/MergeStrategies/SingleLink.php b/src/NlpTools/Clustering/MergeStrategies/SingleLink.php
index 299f72b..1dca89d 100644
--- a/src/NlpTools/Clustering/MergeStrategies/SingleLink.php
+++ b/src/NlpTools/Clustering/MergeStrategies/SingleLink.php
@@ -1,5 +1,7 @@
- return min($this->dm[$xi],$this->dm[$yi]);
+ return min($this->dm[$xi], $this->dm[$yi]);
}
}
diff --git a/src/NlpTools/Documents/DocumentInterface.php b/src/NlpTools/Documents/DocumentInterface.php
index 8118dc8..73b2f1b 100644
--- a/src/NlpTools/Documents/DocumentInterface.php
+++ b/src/NlpTools/Documents/DocumentInterface.php
@@ -1,5 +1,7 @@
data = $data;
}
- public function getDocumentData()
+ public function getDocumentData(): ?string
{
return $this->data;
}
- public function applyTransformation(TransformationInterface $transform)
+ public function applyTransformation(TransformationInterface $transformation): void
+ {
+ $this->data = $transformation->transform($this->data);
+ }
+
+ public function getClass(): string
{
- $this->data = $transform->transform($this->data);
+ return self::class;
}
}
diff --git a/src/NlpTools/Documents/TokensDocument.php b/src/NlpTools/Documents/TokensDocument.php
index 143fc1c..9c72f07 100644
--- a/src/NlpTools/Documents/TokensDocument.php
+++ b/src/NlpTools/Documents/TokensDocument.php
@@ -1,5 +1,7 @@
$tokens
+ */
+ public function __construct(protected array $tokens)
{
- $this->tokens = $tokens;
}
+
/**
* Simply return the tokens received in the constructor
- * @return array The tokens array
+ *
+ * @return array
*/
- public function getDocumentData()
+ public function getDocumentData(): array
{
return $this->tokens;
}
@@ -26,21 +31,24 @@ public function getDocumentData()
/**
* Apply the transform to each token. Filter out the null tokens.
*
- * @param TransformationInterface $transform The transformation to be applied
+ * @param TransformationInterface $transformation The transformation to be applied
*/
- public function applyTransformation(TransformationInterface $transform)
+ public function applyTransformation(TransformationInterface $transformation): void
{
// array_values for re-indexing
$this->tokens = array_values(
array_filter(
array_map(
- array($transform, 'transform'),
+ $transformation->transform(...),
$this->tokens
),
- function ($token) {
- return $token!==null;
- }
+ fn($token): bool => $token !== null
)
);
}
+
+ public function getClass(): string
+ {
+ return self::class;
+ }
}
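
A short sketch of the transformation behaviour described above (the anonymous transformation is
illustrative and assumes, as elsewhere in the library, that TransformationInterface lives in
NlpTools\Utils and declares a single transform() method):

```php
<?php
use NlpTools\Documents\TokensDocument;
use NlpTools\Utils\TransformationInterface;

$document = new TokensDocument(["The", "quick", "", "fox"]);

// lower-case every token; returning null drops the token from the document
$document->applyTransformation(new class implements TransformationInterface {
    public function transform(mixed $w): mixed
    {
        return $w === "" ? null : strtolower($w);
    }
});

$document->getDocumentData(); // ["the", "quick", "fox"]
```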
diff --git a/src/NlpTools/Documents/TrainingDocument.php b/src/NlpTools/Documents/TrainingDocument.php
index 42b9348..fc9738b 100644
--- a/src/NlpTools/Documents/TrainingDocument.php
+++ b/src/NlpTools/Documents/TrainingDocument.php
@@ -1,8 +1,11 @@
d = $d;
- $this->class = $class;
}
- public function getDocumentData()
+
+ /**
+ * @return array
+ */
+ public function getDocumentData(): array
{
- return $this->d->getDocumentData();
+ return $this->document->getDocumentData();
}
- public function getClass()
+
+ public function getClass(): string
{
return $this->class;
}
/**
* Pass the transformation to the decorated document
- *
- * @param TransformationInterface $transform The transformation to be applied
*/
- public function applyTransformation(TransformationInterface $transform)
+ public function applyTransformation(TransformationInterface $transformation): void
{
- $this->d->applyTransformation($transform);
+ $this->document->applyTransformation($transformation);
}
}
diff --git a/src/NlpTools/Documents/TrainingSet.php b/src/NlpTools/Documents/TrainingSet.php
index ba627f4..533cf97 100644
--- a/src/NlpTools/Documents/TrainingSet.php
+++ b/src/NlpTools/Documents/TrainingSet.php
@@ -1,46 +1,60 @@
+ * @implements \ArrayAccess
*/
-class TrainingSet implements \Iterator,\ArrayAccess,\Countable
+class TrainingSet implements \Iterator, \ArrayAccess, \Countable
{
- const CLASS_AS_KEY = 1;
- const OFFSET_AS_KEY = 2;
+ public const CLASS_AS_KEY = 1;
- // An array that contains all the classes present in the TrainingSet
- protected $classSet;
- protected $documents; // The documents container
+ public const OFFSET_AS_KEY = 2;
+
+ /**
+ * An array that contains all the classes present in the TrainingSet
+ *
+ * @var array
+ */
+ protected array $classSet = [];
+
+ /**
+ * The documents container
+ *
+ * @var array
+ */
+ protected array $documents = [];
// When iterated upon what should the key be?
- protected $keytype;
- // When iterated upon the currentDocument
- protected $currentDocument;
+ protected int $keytype = self::CLASS_AS_KEY;
- public function __construct()
- {
- $this->classSet = array();
- $this->documents = array();
- $this->keytype = self::CLASS_AS_KEY;
- }
+ // When iterated upon the currentDocument
+ protected DocumentInterface|false $currentDocument;
/**
* Add a document to the set.
- *
- * @param $class The documents actual class
- * @param $d The Document
- * @return void
*/
- public function addDocument($class, DocumentInterface $d)
+ public function addDocument(string $class, DocumentInterface $document): void
{
- $this->documents[] = new TrainingDocument($class,$d);
+ $this->documents[] = new TrainingDocument($class, $document);
$this->classSet[$class] = 1;
}
- // return the classset
- public function getClassSet()
+
+ /**
+ * Return the classset
+ *
+ * @return array
+ */
+ public function getClassSet(): array
{
return array_keys($this->classSet);
}
@@ -48,86 +62,86 @@ public function getClassSet()
/**
* Decide what should be returned as key when iterated upon
*/
- public function setAsKey($what)
+ public function setAsKey(int $what): void
{
- switch ($what) {
- case self::CLASS_AS_KEY:
- case self::OFFSET_AS_KEY:
- $this->keytype = $what;
- break;
- default:
- $this->keytype = self::CLASS_AS_KEY;
- break;
- }
+ $this->keytype = match ($what) {
+ self::CLASS_AS_KEY, self::OFFSET_AS_KEY => $what,
+ default => self::CLASS_AS_KEY,
+ };
}
/**
* Apply an array of transformations to all documents in this container.
*
- * @param array An array of TransformationInterface instances
+ * @param array $transforms An array of TransformationInterface instances
*/
- public function applyTransformations(array $transforms)
+ public function applyTransformations(array $transforms): void
{
- foreach ($this->documents as $doc) {
+ foreach ($this->documents as $document) {
foreach ($transforms as $transform) {
- $doc->applyTransformation($transform);
+ $document->applyTransformation($transform);
}
}
}
// ====== Implementation of \Iterator interface =========
- public function rewind()
+ public function rewind(): void
{
reset($this->documents);
$this->currentDocument = current($this->documents);
}
- public function next()
+
+ public function next(): void
{
$this->currentDocument = next($this->documents);
}
- public function valid()
+
+ public function valid(): bool
{
- return $this->currentDocument!=false;
+ return $this->currentDocument !== false;
}
- public function current()
+
+ public function current(): DocumentInterface
{
return $this->currentDocument;
}
- public function key()
+
+ public function key(): string|int
{
- switch ($this->keytype) {
- case self::CLASS_AS_KEY:
- return $this->currentDocument->getClass();
- case self::OFFSET_AS_KEY:
- return key($this->documents);
- default:
- // we should never be here
- throw new \Exception("Undefined type as key");
- }
+ return match ($this->keytype) {
+ self::CLASS_AS_KEY => $this->currentDocument->getClass(),
+ self::OFFSET_AS_KEY => key($this->documents),
+ default => throw new \Exception("Undefined type as key"),
+ };
}
+
// === Implementation of \Iterator interface finished ===
// ====== Implementation of \ArrayAccess interface =========
- public function offsetSet($key,$value)
+ public function offsetSet($key, $value): void
{
throw new \Exception("Shouldn't add documents this way, add them through addDocument()");
}
- public function offsetUnset($key)
+
+ public function offsetUnset($key): void
{
throw new \Exception("Cannot unset any document");
}
- public function offsetGet($key)
+
+ public function offsetGet($key): DocumentInterface
{
return $this->documents[$key];
}
- public function offsetExists($key)
+
+ public function offsetExists($key): bool
{
return isset($this->documents[$key]);
}
+
// === Implementation of \ArrayAccess interface finished ===
// implementation of \Countable interface
- public function count()
+ public function count(): int
{
return count($this->documents);
}
diff --git a/src/NlpTools/Documents/WordDocument.php b/src/NlpTools/Documents/WordDocument.php
index a69162a..a9261fc 100644
--- a/src/NlpTools/Documents/WordDocument.php
+++ b/src/NlpTools/Documents/WordDocument.php
@@ -1,5 +1,7 @@
+ */
+ protected array $before = [];
+
+ /**
+ * @var array
+ */
+ protected array $after = [];
+
+ /**
+ * @param array $tokens
+ */
+ public function __construct(array $tokens, int $index, int $context)
{
$this->word = $tokens[$index];
-
- $this->before = array();
- for ($start = max($index-$context,0);$start<$index;$start++) {
+ for ($start = max($index - $context, 0); $start < $index; $start++) {
$this->before[] = $tokens[$start];
}
- $this->after = array();
- $end = min($index+$context+1,count($tokens));
- for ($start = $index+1;$start<$end;$start++) {
+ $end = min($index + $context + 1, count($tokens));
+ for ($start = $index + 1; $start < $end; $start++) {
$this->after[] = $tokens[$start];
}
}
@@ -34,11 +45,11 @@ public function __construct(array $tokens, $index, $context)
* the second element being an array of previous words, and the
* third an array of following words
*
- * @return array
+ * @return array
*/
- public function getDocumentData()
+ public function getDocumentData(): array
{
- return array($this->word,$this->before,$this->after);
+ return [$this->word, $this->before, $this->after];
}
/**
@@ -46,20 +57,18 @@ public function getDocumentData()
* Filter out the null tokens from the context. If the word is transformed
* to null it is for the feature factory to decide what to do.
*
- * @param TransformationInterface $transform The transformation to be applied
+ * @param TransformationInterface $transformation The transformation to be applied
*/
- public function applyTransformation(TransformationInterface $transform)
+ public function applyTransformation(TransformationInterface $transformation): void
{
- $null_filter = function ($token) {
- return $token!==null;
- };
+ $null_filter = fn($token): bool => $token !== null;
- $this->word = $transform->transform($this->word);
+ $this->word = $transformation->transform($this->word);
// array_values for re-indexing
$this->before = array_values(
array_filter(
array_map(
- array($transform,"transform"),
+ $transformation->transform(...),
$this->before
),
$null_filter
@@ -68,11 +77,16 @@ public function applyTransformation(TransformationInterface $transform)
$this->after = array_values(
array_filter(
array_map(
- array($transform,"transform"),
+ $transformation->transform(...),
$this->after
),
$null_filter
)
);
}
+
+ public function getClass(): string
+ {
+ return self::class;
+ }
}
diff --git a/src/NlpTools/Exceptions/InvalidExpression.php b/src/NlpTools/Exceptions/InvalidExpression.php
index 24428e9..0f9dc2b 100644
--- a/src/NlpTools/Exceptions/InvalidExpression.php
+++ b/src/NlpTools/Exceptions/InvalidExpression.php
@@ -1,4 +1,7 @@
getDocumentData();
+ return $document->getDocumentData();
}
}
diff --git a/src/NlpTools/FeatureFactories/FeatureFactoryInterface.php b/src/NlpTools/FeatureFactories/FeatureFactoryInterface.php
index 83cfb9e..5404d03 100644
--- a/src/NlpTools/FeatureFactories/FeatureFactoryInterface.php
+++ b/src/NlpTools/FeatureFactories/FeatureFactoryInterface.php
@@ -1,5 +1,7 @@
*/
- public function getFeatureArray($class, DocumentInterface $d);
+ public function getFeatureArray(string $class, DocumentInterface $document): array;
}
diff --git a/src/NlpTools/FeatureFactories/FunctionFeatures.php b/src/NlpTools/FeatureFactories/FunctionFeatures.php
index f3e8b2b..1ba3838 100644
--- a/src/NlpTools/FeatureFactories/FunctionFeatures.php
+++ b/src/NlpTools/FeatureFactories/FunctionFeatures.php
@@ -1,8 +1,10 @@
$functions
*/
- public function __construct(array $f=array())
+ public function __construct(protected array $functions = [])
{
- $this->functions=$f;
- $this->frequency=false;
}
+
/**
* Set the feature factory to model frequency instead of presence
*/
- public function modelFrequency()
+ public function modelFrequency(): void
{
$this->frequency = true;
}
+
/**
* Set the feature factory to model presence instead of frequency
*/
- public function modelPresence()
+ public function modelPresence(): void
{
$this->frequency = false;
}
+
/**
* Add a function as a feature
- *
- * @param callable $feature
*/
- public function add( $feature )
+ public function add(callable $feature): void
{
$this->functions[] = $feature;
}
@@ -58,36 +57,39 @@ public function add( $feature )
* the feature set. If the return value is an array iterate over it
* and add each value to the feature set.
*
- * @param string $class The class for which we are calculating features
- * @param DocumentInterface $d The document for which we are calculating features
- * @return array
+ * @return array
*/
- public function getFeatureArray($class, DocumentInterface $d)
+ public function getFeatureArray(string $class, DocumentInterface $document): array
{
$features = array_filter(
- array_map( function ($feature) use ($class,$d) {
- return call_user_func($feature, $class, $d);
- },
+ array_map(
+ fn($feature): mixed => call_user_func($feature, $class, $document),
$this->functions
- ));
- $set = array();
- foreach ($features as $f) {
- if (is_array($f)) {
- foreach ($f as $ff) {
- if (!isset($set[$ff]))
+ )
+ );
+ $set = [];
+ foreach ($features as $feature) {
+ if (is_array($feature)) {
+ foreach ($feature as $ff) {
+ if (!isset($set[$ff])) {
$set[$ff] = 0;
+ }
+
$set[$ff]++;
}
} else {
- if (!isset($set[$f]))
- $set[$f] = 0;
- $set[$f]++;
+ if (!isset($set[$feature])) {
+ $set[$feature] = 0;
+ }
+
+ $set[$feature]++;
}
}
- if ($this->frequency)
+
+ if ($this->frequency) {
return $set;
- else
- return array_keys($set);
- }
+ }
+ return array_keys($set);
+ }
}
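
A hedged usage sketch for the constructor-promoted FunctionFeatures above (the feature callback
and class label are made up):

```php
<?php
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\FunctionFeatures;

$featureFactory = new FunctionFeatures([
    // a feature function receives ($class, $document) and may return a string,
    // an array of strings, or null
    fn ($class, $document): array => array_map(
        fn ($token): string => $class . ":" . $token,
        $document->getDocumentData()
    ),
]);
$featureFactory->modelFrequency();

$featureFactory->getFeatureArray("sports", new TokensDocument(["ball", "ball", "goal"]));
// ["sports:ball" => 2, "sports:goal" => 1]
```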
diff --git a/src/NlpTools/Models/FeatureBasedNB.php b/src/NlpTools/Models/FeatureBasedNB.php
index 556c6a5..0072d10 100644
--- a/src/NlpTools/Models/FeatureBasedNB.php
+++ b/src/NlpTools/Models/FeatureBasedNB.php
@@ -1,9 +1,11 @@
priors = array();
- $this->condprob = array();
- $this->unknown = array();
- }
+ /**
+ * Computed prior probabilities
+ *
+ * @var array
+ */
+ protected array $priors = [];
+
+ /**
+ * Computed conditional probabilities
+ *
+ * @var array
+ */
+ protected array $condprob = [];
+
+ /**
+ * Probability for each unknown word in a class a/(len(terms[class])+a*len(V))
+ *
+ * @var array
+ */
+ protected array $unknown = [];
/**
* Return the prior probability of class $class
* P(c) as computed by the training data
- *
- * @param string $class
- * @return float prior probability
*/
- public function getPrior($class)
+ public function getPrior(string $class): float
{
- return isset($this->priors[$class])
- ? $this->priors[$class]
- : 0;
+ return $this->priors[$class] ?? 0;
}
/**
@@ -44,19 +48,14 @@ public function getPrior($class)
*
* @param string $term The term (word, feature id, ...)
* @param string $class The class
- * @return float
*/
- public function getCondProb($term,$class)
+ public function getCondProb(string $term, string $class): float
{
if (!isset($this->condprob[$term][$class])) {
-
- return isset($this->unknown[$class])
- ? $this->unknown[$class]
- : 0;
-
- } else {
- return $this->condprob[$term][$class];
+ return $this->unknown[$class] ?? 0;
}
+
+ return $this->condprob[$term][$class];
}
/**
@@ -67,38 +66,38 @@ public function getCondProb($term,$class)
* It can be used for incremental training. It is not meant to be used
* with the same training set twice.
*
- * @param array $train_ctx The previous training context
- * @param FeatureFactoryInterface $ff A feature factory to compute features from a training document
- * @param TrainingSet The training set
- * @param integer $a_smoothing The parameter for additive smoothing. Defaults to add-one smoothing.
- * @return array Return a training context to be used for further incremental training,
+ * @param array $trainContext The previous training context
+ * @param FeatureFactoryInterface $featureFactory A feature factory to compute features from a training document
+ * @param TrainingSet $trainingSet The training set
+ * @param integer $additiveSmoothing The parameter for additive smoothing. Defaults to add-one smoothing.
+ * @return array Return a training context to be used for further incremental training,
* although this is not necessary since the changes also happen in place
*/
- public function train_with_context(array &$train_ctx, FeatureFactoryInterface $ff, TrainingSet $tset, $a_smoothing=1)
+ public function trainWithContext(array &$trainContext, FeatureFactoryInterface $featureFactory, TrainingSet $trainingSet, int $additiveSmoothing = 1): array
{
$this->countTrainingSet(
- $ff,
- $tset,
- $train_ctx['termcount_per_class'],
- $train_ctx['termcount'],
- $train_ctx['ndocs_per_class'],
- $train_ctx['voc'],
- $train_ctx['ndocs']
- );
+ $featureFactory,
+ $trainingSet,
+ $trainContext['termcount_per_class'],
+ $trainContext['termcount'],
+ $trainContext['ndocs_per_class'],
+ $trainContext['voc'],
+ $trainContext['ndocs']
+ );
- $voccount = count($train_ctx['voc']);
+ $voccount = count($trainContext['voc']);
$this->computeProbabilitiesFromCounts(
- $tset->getClassSet(),
- $train_ctx['termcount_per_class'],
- $train_ctx['termcount'],
- $train_ctx['ndocs_per_class'],
- $train_ctx['ndocs'],
- $voccount,
- $a_smoothing
- );
-
- return $train_ctx;
+ $trainingSet->getClassSet(),
+ $trainContext['termcount_per_class'],
+ $trainContext['termcount'],
+ $trainContext['ndocs_per_class'],
+ $trainContext['ndocs'],
+ $voccount,
+ $additiveSmoothing
+ );
+
+ return $trainContext;
}
/**
@@ -111,24 +110,18 @@ public function train_with_context(array &$train_ctx, FeatureFactoryInterface $f
* More information on the algorithm can be found at
* http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
*
- * @param FeatureFactoryInterface A feature factory to compute features from a training document
- * @param TrainingSet The training set
- * @param integer $a_smoothing The parameter for additive smoothing. Defaults to add-one smoothing.
- * @return array Return a training context to be used for incremental training
+ * @param FeatureFactoryInterface $featureFactory A feature factory to compute features from a training document
+ * @param TrainingSet $trainingSet The training set
+ * @param integer $additiveSmoothing The parameter for additive smoothing. Defaults to add-one smoothing.
+ * @return array Return a training context to be used for incremental training
*/
- public function train(FeatureFactoryInterface $ff, TrainingSet $tset, $a_smoothing=1)
+ public function train(FeatureFactoryInterface $featureFactory, TrainingSet $trainingSet, int $additiveSmoothing = 1): array
{
- $class_set = $tset->getClassSet();
-
- $ctx = array(
- 'termcount_per_class'=>array_fill_keys($class_set,0),
- 'termcount'=>array_fill_keys($class_set,array()),
- 'ndocs_per_class'=>array_fill_keys($class_set,0),
- 'voc'=>array(),
- 'ndocs'=>0
- );
+ $class_set = $trainingSet->getClassSet();
+
+        $ctx = [
+            'termcount_per_class' => array_fill_keys($class_set, 0),
+            'termcount' => array_fill_keys($class_set, []),
+            'ndocs_per_class' => array_fill_keys($class_set, 0),
+            'voc' => [],
+            'ndocs' => 0,
+        ];
- return $this->train_with_context($ctx,$ff,$tset,$a_smoothing);
+ return $this->trainWithContext($ctx, $featureFactory, $trainingSet, $additiveSmoothing);
}
/**
@@ -136,33 +129,37 @@ public function train(FeatureFactoryInterface $ff, TrainingSet $tset, $a_smoothi
* by reference and they are filled in this function. Useful for not
* making copies of big arrays.
*
- * @param FeatureFactoryInterface $ff A feature factory to create the features for each document in the set
- * @param TrainingSet $tset The training set (collection of labeled documents)
- * @param array $termcount_per_class The count of occurences of each feature in each class
- * @param array $termcount The total count of occurences of each term
- * @param array $ndocs_per_class The total number of documents per class
- * @param array $voc A set of the found features
+ * @param FeatureFactoryInterface $featureFactory A feature factory to create the features for each document in the set
+ * @param TrainingSet $trainingSet The training set (collection of labeled documents)
+     * @param array                    $termcountPerClass The count of occurrences of each feature in each class
+     * @param array                    $termcount      The total count of occurrences of each term
+ * @param array $ndocsPerClass The total number of documents per class
+ * @param array $voc A set of the found features
* @param integer $ndocs The number of documents
* @return void
*/
- protected function countTrainingSet(FeatureFactoryInterface $ff, TrainingSet $tset, array &$termcount_per_class, array &$termcount, array &$ndocs_per_class, array &$voc, &$ndocs)
+ protected function countTrainingSet(FeatureFactoryInterface $featureFactory, TrainingSet $trainingSet, array &$termcountPerClass, array &$termcount, array &$ndocsPerClass, array &$voc, int &$ndocs)
{
- foreach ($tset as $tdoc) {
+ foreach ($trainingSet as $tdoc) {
$ndocs++;
$c = $tdoc->getClass();
- $ndocs_per_class[$c]++;
- $features = $ff->getFeatureArray($c,$tdoc);
- if (is_int(key($features)))
+ $ndocsPerClass[$c]++;
+ $features = $featureFactory->getFeatureArray($c, $tdoc);
+ if (is_int(key($features))) {
$features = array_count_values($features);
- foreach ($features as $f=>$fcnt) {
- if (!isset($voc[$f]))
+ }
+
+ foreach ($features as $f => $fcnt) {
+ if (!isset($voc[$f])) {
$voc[$f] = 0;
+ }
- $termcount_per_class[$c]+=$fcnt;
- if (isset($termcount[$c][$f]))
- $termcount[$c][$f]+=$fcnt;
- else
+ $termcountPerClass[$c] += $fcnt;
+ if (isset($termcount[$c][$f])) {
+ $termcount[$c][$f] += $fcnt;
+ } else {
$termcount[$c][$f] = $fcnt;
+ }
}
}
}
@@ -171,25 +168,26 @@ protected function countTrainingSet(FeatureFactoryInterface $ff, TrainingSet $ts
* Compute the probabilities given the counts of the features in the
* training set.
*
- * @param array $class_set Just the array that contains the classes
- * @param array $termcount_per_class The count of occurences of each feature in each class
- * @param array $termcount The total count of occurences of each term
- * @param array $ndocs_per_class The total number of documents per class
+ * @param array $class_set Just the array that contains the classes
+     * @param array   $termcountPerClass The count of occurrences of each feature in each class
+     * @param array   $termcount  The total count of occurrences of each term
+ * @param array $ndocsPerClass The total number of documents per class
* @param integer $ndocs The total number of documents
* @param integer $voccount The total number of features found
* @return void
*/
- protected function computeProbabilitiesFromCounts(array $class_set, array &$termcount_per_class, array &$termcount, array &$ndocs_per_class, $ndocs, $voccount, $a_smoothing=1)
+ protected function computeProbabilitiesFromCounts(array $class_set, array &$termcountPerClass, array &$termcount, array &$ndocsPerClass, int $ndocs, int $voccount, int $additiveSmoothing = 1)
{
- $denom_smoothing = $a_smoothing*$voccount;
+ $denom_smoothing = $additiveSmoothing * $voccount;
foreach ($class_set as $class) {
- $this->priors[$class] = $ndocs_per_class[$class] / $ndocs;
- foreach ($termcount[$class] as $term=>$count) {
- $this->condprob[$term][$class] = ($count + $a_smoothing) / ($termcount_per_class[$class] + $denom_smoothing);
+ $this->priors[$class] = $ndocsPerClass[$class] / $ndocs;
+ foreach ($termcount[$class] as $term => $count) {
+ $this->condprob[$term][$class] = ($count + $additiveSmoothing) / ($termcountPerClass[$class] + $denom_smoothing);
}
}
+
foreach ($class_set as $class) {
- $this->unknown[$class] = $a_smoothing / ($termcount_per_class[$class] + $denom_smoothing);
+ $this->unknown[$class] = $additiveSmoothing / ($termcountPerClass[$class] + $denom_smoothing);
}
}
@@ -198,6 +196,6 @@ protected function computeProbabilitiesFromCounts(array $class_set, array &$term
*/
public function __sleep()
{
- return array('priors','condprob','unknown');
+ return ['priors', 'condprob', 'unknown'];
}
}
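
A short training sketch against the renamed API above (train(), getPrior(), getCondProb()). TokensDocument, TrainingSet::addDocument() and DataAsFeatures are assumed to work as elsewhere in the library.

```php
<?php
// Sketch only: the document and feature-factory classes are assumptions; the model
// methods are the ones shown in the diff above.
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Models\FeatureBasedNB;

$trainingSet = new TrainingSet();
$trainingSet->addDocument('spam', new TokensDocument(['buy', 'cheap', 'pills']));
$trainingSet->addDocument('ham', new TokensDocument(['meeting', 'tomorrow', 'morning']));

$model = new FeatureBasedNB();
// add-one smoothing by default; the returned context can later be passed to
// trainWithContext() for incremental training on additional documents
$context = $model->train(new DataAsFeatures(), $trainingSet);

echo $model->getPrior('spam'), PHP_EOL;             // 0.5 for this toy set
echo $model->getCondProb('pills', 'spam'), PHP_EOL;
```
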
diff --git a/src/NlpTools/Models/Lda.php b/src/NlpTools/Models/Lda.php
index bec01d2..9e3a56e 100644
--- a/src/NlpTools/Models/Lda.php
+++ b/src/NlpTools/Models/Lda.php
@@ -1,5 +1,7 @@
+ */
+ protected array $count_docs_topics;
+
+ /**
+ * @var array
+ */
+ protected array $count_topics_words;
+
+ /**
+ * @var array
+ */
+ protected array $words_in_doc;
+
+ /**
+ * @var array
+ */
+ protected array $words_in_topic;
- protected $ntopics;
- protected $a;
- protected $b;
+ /**
+ * @var array
+ */
+ protected array $word_doc_assigned_topic;
-    protected $mt;
+    protected MersenneTwister $mt;
+
+    protected int $voccnt;
- protected $count_docs_topics;
- protected $count_topics_words;
- protected $words_in_doc;
- protected $words_in_topic;
- protected $word_doc_assigned_topic;
- protected $voccnt;
- protected $voc;
+ /**
+ * @var array
+ */
+ protected array $voc;
/**
- * @param FeatureFactoryInterface $ff The feature factory will be applied to each document and the resulting feature array will be considered as a document for LDA
+ * @param FeatureFactoryInterface $featureFactory The feature factory will be applied to each document and the resulting feature array will be considered as a document for LDA
* @param integer $ntopics The number of topics assumed by the model
* @param float $a The dirichlet prior assumed for the per document topic distribution
* @param float $b The dirichlet prior assumed for the per word topic distribution
*/
- public function __construct(FeatureFactoryInterface $ff,$ntopics,$a=1,$b=1)
+ public function __construct(protected FeatureFactoryInterface $featureFactory, protected int $ntopics, protected float $a = 1, protected float $b = 1)
{
- $this->ff = $ff;
-
- $this->ntopics = $ntopics;
- $this->a = $a;
- $this->b = $b;
-
$this->mt = new MersenneTwister();
}
/**
* Generate an array suitable for use with Lda::initialize and
* Lda::gibbsSample from a training set.
+ *
+ * @return array
*/
- public function generateDocs(TrainingSet $tset)
+ public function generateDocs(TrainingSet $trainingSet): array
{
- $docs = array();
- foreach ($tset as $d)
- $docs[] = $this->ff->getFeatureArray('',$d);
+ $docs = [];
+ foreach ($trainingSet as $d) {
+ $docs[] = $this->featureFactory->getFeatureArray('', $d);
+ }
return $docs;
}
@@ -70,12 +87,12 @@ public function generateDocs(TrainingSet $tset)
     * Count initially the co-occurrences of documents, topics and
     * topics, words and cache them to run Gibbs sampling faster
*
- * @param array $docs The docs that we will use to generate the sample
+ * @param array $docs The docs that we will use to generate the sample
*/
- public function initialize(array &$docs)
+ public function initialize(array &$docs): void
{
- $doc_keys = range(0,count($docs)-1);
- $topic_keys = range(0,$this->ntopics-1);
+ $doc_keys = range(0, count($docs) - 1);
+ $topic_keys = range(0, $this->ntopics - 1);
// initialize the arrays
$this->words_in_doc = array_fill_keys(
@@ -95,26 +112,28 @@ public function initialize(array &$docs)
);
$this->count_topics_words = array_fill_keys(
$topic_keys,
- array()
+ []
);
$this->word_doc_assigned_topic = array_fill_keys(
$doc_keys,
- array()
+ []
);
- $this->voc = array();
+ $this->voc = [];
- foreach ($docs as $i=>$doc) {
+ foreach ($docs as $i => $doc) {
$this->words_in_doc[$i] = count($doc);
- foreach ($doc as $idx=>$w) {
+ foreach ($doc as $idx => $w) {
// choose a topic randomly to assign this word to
- $topic = (int) ($this->mt->generate()*$this->ntopics);
+ $topic = (int) ($this->mt->generate() * $this->ntopics);
//$this->words_in_doc[$i]++;
$this->words_in_topic[$topic]++;
$this->count_docs_topics[$i][$topic]++;
- if (!isset($this->count_topics_words[$topic][$w]))
- $this->count_topics_words[$topic][$w]=0;
+ if (!isset($this->count_topics_words[$topic][$w])) {
+ $this->count_topics_words[$topic][$w] = 0;
+ }
+
$this->count_topics_words[$topic][$w]++;
$this->word_doc_assigned_topic[$i][$idx] = $topic;
@@ -122,38 +141,36 @@ public function initialize(array &$docs)
$this->voc[$w] = 1;
}
}
+
$this->voccnt = count($this->voc);
$this->voc = array_keys($this->voc);
}
/**
* Run the gibbs sampler $it times.
- *
- * @param TrainingSet The docs to run lda on
- * @param $it The number of iterations to run
*/
- public function train(TrainingSet $tset,$it)
+ public function train(TrainingSet $trainingSet, int $it): void
{
- $docs = $this->generateDocs($tset);
+ $docs = $this->generateDocs($trainingSet);
$this->initialize($docs);
while ($it-- > 0) {
$this->gibbsSample($docs);
}
- }
+ }
/**
* Generate one gibbs sample.
* The docs must have been passed to initialize previous to calling
* this function.
*
- * @param array $docs The docs that we will use to generate the sample
+ * @param array $docs The docs that we will use to generate the sample
*/
- public function gibbsSample(array &$docs)
+ public function gibbsSample(array &$docs): void
{
- foreach ($docs as $i=>$doc) {
- foreach ($doc as $idx=>$w) {
+ foreach ($docs as $i => $doc) {
+ foreach ($doc as $idx => $w) {
// remove word $w from the dataset
$topic = $this->word_doc_assigned_topic[$i][$idx];
$this->count_docs_topics[$i][$topic]--;
@@ -164,13 +181,15 @@ public function gibbsSample(array &$docs)
// recompute the probabilities of all topics and
// resample a topic for this word $w
- $p_topics = $this->conditionalDistribution($i,$w);
+ $p_topics = $this->conditionalDistribution($i, $w);
$topic = $this->drawIndex($p_topics);
// ---------------------------
// add word $w back into the dataset
- if (!isset($this->count_topics_words[$topic][$w]))
- $this->count_topics_words[$topic][$w]=0;
+ if (!isset($this->count_topics_words[$topic][$w])) {
+ $this->count_topics_words[$topic][$w] = 0;
+ }
+
$this->count_topics_words[$topic][$w]++;
$this->count_docs_topics[$i][$topic]++;
@@ -180,176 +199,180 @@ public function gibbsSample(array &$docs)
// ---------------------------
}
}
- }
+ }
/**
* Get the probability of a word given a topic (phi according to
* Griffiths and Steyvers)
*
- * @param $limit_words Limit the results to the top n words
- * @return array A two dimensional array that contains the probabilities for each topic
+ * @param int $limitWords Limit the results to the top n words
+ * @return array A two dimensional array that contains the probabilities for each topic
*/
- public function getWordsPerTopicsProbabilities($limit_words=-1)
+ public function getWordsPerTopicsProbabilities(int $limitWords = -1): array
{
$p_t_w = array_fill_keys(
- range(0,$this->ntopics-1),
- array()
+ range(0, $this->ntopics - 1),
+ []
);
- foreach ($p_t_w as $topic=>&$p) {
- $denom = $this->words_in_topic[$topic]+$this->voccnt*$this->b;
- foreach ($this->voc as $w) {
- if (isset($this->count_topics_words[$topic][$w]))
- $p[$w] = $this->count_topics_words[$topic][$w]+$this->b;
- else
- $p[$w] = $this->b;
- $p[$w] /= $denom;
- }
- if ($limit_words>0) {
- arsort($p);
- $p = array_slice($p,0,$limit_words,true); // true to preserve the keys
- }
- }
+ foreach ($p_t_w as $topic => &$p) {
+ $denom = $this->words_in_topic[$topic] + $this->voccnt * $this->b;
+ foreach ($this->voc as $w) {
+ $p[$w] = isset($this->count_topics_words[$topic][$w]) ? $this->count_topics_words[$topic][$w] + $this->b : $this->b;
+ $p[$w] /= $denom;
+ }
+
+ if ($limitWords > 0) {
+ arsort($p);
+ $p = array_slice($p, 0, $limitWords, true); // true to preserve the keys
+ }
+ }
return $p_t_w;
- }
+ }
/**
- * Shortcut to getWordsPerTopicsProbabilities
- */
- public function getPhi($limit_words=-1)
- {
- return $this->getWordsPerTopicsProbabilities($limit_words);
- }
+ * Shortcut to getWordsPerTopicsProbabilities
+ *
+ * @return array
+ */
+ public function getPhi(int $limitWords = -1): array
+ {
+ return $this->getWordsPerTopicsProbabilities($limitWords);
+ }
/**
* Get the probability of a document given a topic (theta according
* to Griffiths and Steyvers)
*
- * @param $limit_docs Limit the results to the top n docs
- * @return array A two dimensional array that contains the probabilities for each document
+ * @param int $limitDocs Limit the results to the top n docs
+ * @return array A two dimensional array that contains the probabilities for each document
*/
- public function getDocumentsPerTopicsProbabilities($limit_docs=-1)
- {
- $p_t_d = array_fill_keys(
- range(0,$this->ntopics-1),
- array()
- );
+ public function getDocumentsPerTopicsProbabilities(int $limitDocs = -1): array
+ {
+ $p_t_d = array_fill_keys(
+ range(0, $this->ntopics - 1),
+ []
+ );
- $doccnt = count($this->words_in_doc);
- $denom = $doccnt + $this->ntopics*$this->a;
- $count_topics_docs = array();
- foreach ($this->count_docs_topics as $doc=>$topics) {
- foreach ($topics as $t=>$c)
- $count_topics_docs[$doc][$t]++;
- }
-
- foreach ($p_t_d as $topic=>&$p) {
- foreach ($count_topics_docs as $doc=>$tc) {
- $p[$doc] = ($tc[$topic] + $this->a)/$denom;
- }
- if ($limit_words>0) {
- arsort($p);
- $p = array_slice($p,0,$limit_words,true); // true to preserve the keys
- }
- }
-
- return $p;
- }
+ $doccnt = count($this->words_in_doc);
+ $denom = $doccnt + $this->ntopics * $this->a;
+ $countTopicsDocs = [];
+ foreach ($this->count_docs_topics as $doc => $topics) {
+ foreach ($topics as $t => $c) {
+                $countTopicsDocs[$doc][$t] = $c;
+ }
+ }
+
+ foreach ($p_t_d as $topic => &$p) {
+ foreach ($countTopicsDocs as $doc => $tc) {
+ $p[$doc] = ($tc[$topic] + $this->a) / $denom;
+ }
+
+ if ($limitDocs > 0) {
+ arsort($p);
+ $p = array_slice($p, 0, $limitDocs, true); // true to preserve the keys
+ }
+ }
+
+        return $p_t_d;
+ }
/**
* Shortcut to getDocumentsPerTopicsProbabilities
+ *
+ * @return array
*/
- public function getTheta($limit_docs=-1)
- {
- return $this->getDocumentsPerTopicsProbabilities($limit_docs);
- }
+ public function getTheta(int $limitDocs = -1): array
+ {
+ return $this->getDocumentsPerTopicsProbabilities($limitDocs);
+ }
/**
* Log likelihood of the model having generated the data as
* implemented by M. Blondel
*/
- public function getLogLikelihood()
- {
- $voccnt = $this->voccnt;
- $lik = 0;
- $b = $this->b;
- $a = $this->a;
- foreach ($this->count_topics_words as $topic=>$words) {
- $lik += $this->log_multi_beta(
- $words,
+ public function getLogLikelihood(): int|float
+ {
+ $voccnt = $this->voccnt;
+ $lik = 0;
+ $b = $this->b;
+ $a = $this->a;
+ foreach ($this->count_topics_words as $count_topic_word) {
+ $lik += $this->logMultiBeta(
+ $count_topic_word,
$b
- );
- $lik -= $this->log_multi_beta(
+ );
+ $lik -= $this->logMultiBeta(
$b,
0,
$voccnt
- );
- }
- foreach ($this->count_docs_topics as $doc=>$topics) {
- $lik += $this->log_multi_beta(
- $topics,
+ );
+ }
+
+ foreach ($this->count_docs_topics as $count_doc_topic) {
+ $lik += $this->logMultiBeta(
+ $count_doc_topic,
$a
- );
- $lik -= $this->log_multi_beta(
+ );
+ $lik -= $this->logMultiBeta(
$a,
0,
$this->ntopics
- );
- }
+ );
+ }
- return $lik;
- }
+ return $lik;
+ }
/**
* This is the implementation of the equation number 5 in the paper
* by Griffiths and Steyvers.
*
- * @return array The vector of probabilites for all topics as computed by the equation 5
+     * @return array  The vector of probabilities for all topics as computed by equation 5
*/
- protected function conditionalDistribution($i,$w)
- {
- $p = array_fill_keys(range(0,$this->ntopics-1),0);
- for ($topic=0;$topic<$this->ntopics;$topic++) {
- if (isset($this->count_topics_words[$topic][$w]))
- $numerator = $this->count_topics_words[$topic][$w]+$this->b;
- else
- $numerator = $this->b;
-
- $numerator *= $this->count_docs_topics[$i][$topic]+$this->a;
-
- $denominator = $this->words_in_topic[$topic]+$this->voccnt*$this->b;
- $denominator *= $this->words_in_doc[$i]+$this->ntopics*$this->a;
-
- $p[$topic] = $numerator/$denominator;
- }
-
- // divide by sum to obtain probabilities
- $sum = array_sum($p);
-
- return array_map(
- function ($p) use ($sum) {
- return $p/$sum;
- },
+ protected function conditionalDistribution(int $i, mixed $w): array
+ {
+ $p = array_fill_keys(range(0, $this->ntopics - 1), 0);
+ for ($topic = 0; $topic < $this->ntopics; $topic++) {
+ $numerator = isset($this->count_topics_words[$topic][$w]) ? $this->count_topics_words[$topic][$w] + $this->b : $this->b;
+
+ $numerator *= $this->count_docs_topics[$i][$topic] + $this->a;
+
+ $denominator = $this->words_in_topic[$topic] + $this->voccnt * $this->b;
+ $denominator *= $this->words_in_doc[$i] + $this->ntopics * $this->a;
+
+ $p[$topic] = $numerator / $denominator;
+ }
+
+ // divide by sum to obtain probabilities
+ $sum = array_sum($p);
+
+ return array_map(
+ fn($p): float => $p / $sum,
$p
- );
- }
+ );
+ }
/**
* Draw once from a multinomial distribution and return the index
* of that is drawn.
*
- * @return int The index that was drawn.
+ * @param array $d
+ * @return int|null The index that was drawn.
*/
- protected function drawIndex(array $d)
- {
- $x = $this->mt->generate();
- $p = 0.0;
- foreach ($d as $i=>$v) {
- $p+=$v;
- if ($p > $x)
+ protected function drawIndex(array $d): int|null
+ {
+ $x = $this->mt->generate();
+ $p = 0.0;
+ foreach ($d as $i => $v) {
+ $p += $v;
+ if ($p > $x) {
return $i;
- }
- }
+ }
+ }
+
+ return null;
+ }
/**
* Gamma function from picomath.org
@@ -359,12 +382,13 @@ protected function drawIndex(array $d)
* TODO: These should probably move outside of NlpTools together
* with the Random namespace and form a nice php math library
*/
- private function gamma($x)
+ private function gamma(float $x): float
{
$gamma = 0.577215664901532860606512090; # Euler's gamma constant
if ($x < 0.001) {
- return 1.0/($x*(1.0 + $gamma*$x));
+ return 1.0 / ($x * (1.0 + $gamma * $x));
}
+
if ($x < 12.0) {
# The algorithm directly approximates gamma over (1,2) and uses
# reduction identities to reduce other arguments to this interval.
@@ -379,48 +403,32 @@ private function gamma($x)
$n = floor($y) - 1; # will use n later
$y -= $n;
}
+
# numerator coefficients for approximation over the interval (1,2)
$p =
- array(
- -1.71618513886549492533811E+0,
- 2.47656508055759199108314E+1,
- -3.79804256470945635097577E+2,
- 6.29331155312818442661052E+2,
- 8.66966202790413211295064E+2,
- -3.14512729688483675254357E+4,
- -3.61444134186911729807069E+4,
- 6.64561438202405440627855E+4
- );
+ [-1.71618513886549492533811E+0, 2.47656508055759199108314E+1, -3.79804256470945635097577E+2, 6.29331155312818442661052E+2, 8.66966202790413211295064E+2, -3.14512729688483675254357E+4, -3.61444134186911729807069E+4, 6.64561438202405440627855E+4];
# denominator coefficients for approximation over the interval (1,2)
$q =
- array(
- -3.08402300119738975254353E+1,
- 3.15350626979604161529144E+2,
- -1.01515636749021914166146E+3,
- -3.10777167157231109440444E+3,
- 2.25381184209801510330112E+4,
- 4.75584627752788110767815E+3,
- -1.34659959864969306392456E+5,
- -1.15132259675553483497211E+5
- );
+ [-3.08402300119738975254353E+1, 3.15350626979604161529144E+2, -1.01515636749021914166146E+3, -3.10777167157231109440444E+3, 2.25381184209801510330112E+4, 4.75584627752788110767815E+3, -1.34659959864969306392456E+5, -1.15132259675553483497211E+5];
$num = 0.0;
$den = 1.0;
$z = $y - 1;
for ($i = 0; $i < 8; $i++) {
- $num = ($num + $p[$i])*$z;
- $den = $den*$z + $q[$i];
+ $num = ($num + $p[$i]) * $z;
+ $den = $den * $z + $q[$i];
}
- $result = $num/$den + 1.0;
+
+ $result = $num / $den + 1.0;
# Apply correction if argument was not initially in (1,2)
if ($arg_was_less_than_one) {
# Use identity gamma(z) = gamma(z+1)/z
# The variable "result" now holds gamma of the original y + 1
                # Thus we use y-1 to get back the original y.
- $result /= ($y-1.0);
+ $result /= ($y - 1.0);
} else {
# Use the identity gamma(z+n) = z*(z+1)* ... *(z+n-1)*gamma(z)
for ($i = 0; $i < $n; $i++) {
@@ -437,12 +445,13 @@ private function gamma($x)
if ($x > 171.624) {
# Correct answer too large to display.
- return Double.POSITIVE_INFINITY;
+                return INF;
}
- return exp($this->log_gamma($x));
+ return exp($this->logGamma($x));
}
- private function log_gamma($x)
+
+ private function logGamma(float $x): float
{
if ($x < 12.0) {
return log(abs($this->gamma($x)));
@@ -454,58 +463,56 @@ private function log_gamma($x)
# A Course in Modern Analysis (1927), page 252
$c =
- array(
- 1.0/12.0,
- -1.0/360.0,
- 1.0/1260.0,
- -1.0/1680.0,
- 1.0/1188.0,
- -691.0/360360.0,
- 1.0/156.0,
- -3617.0/122400.0
- );
- $z = 1.0/($x*$x);
+ [1.0 / 12.0, -1.0 / 360.0, 1.0 / 1260.0, -1.0 / 1680.0, 1.0 / 1188.0, -691.0 / 360360.0, 1.0 / 156.0, -3617.0 / 122400.0];
+ $z = 1.0 / ($x * $x);
$sum = $c[7];
- for ($i=6; $i >= 0; $i--) {
+ for ($i = 6; $i >= 0; $i--) {
$sum *= $z;
$sum += $c[$i];
}
- $series = $sum/$x;
+
+ $series = $sum / $x;
$halfLogTwoPi = 0.91893853320467274178032973640562;
- $logGamma = ($x - 0.5)*log($x) - $x + $halfLogTwoPi + $series;
- return $logGamma;
+ return ($x - 0.5) * log($x) - $x + $halfLogTwoPi + $series;
}
- private function log_gamma_array($a)
+ /**
+ * @param array $a
+ * @return array
+ */
+ private function logGammaArray(array $a): array
{
- foreach ($a as &$x)
- $x = $this->log_gamma($x);
+ foreach ($a as &$x) {
+ $x = $this->logGamma($x);
+ }
return $a;
}
- private function log_multi_beta($a,$y=0,$k=null)
+
+ /**
+ * @param float|array $a
+ */
+ private function logMultiBeta(float|array $a, float|int $y = 0, ?float $k = null): float
{
- if ($k==null) {
+ if ($k === null) {
$ay = array_map(
- function ($x) use ($y) {
- return $x+$y;
- },
+ fn($x): float => $x + $y,
$a
);
return array_sum(
- $this->log_gamma_array(
+ $this->logGammaArray(
$ay
)
- )-$this->log_gamma(
+ ) - $this->logGamma(
array_sum(
$ay
)
);
- } else {
- return $k*$this->log_gamma($a) - $this->log_gamma($k*$a);
}
+
+ return $k * $this->logGamma($a) - $this->logGamma($k * $a);
}
}
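
A minimal sketch of driving the Gibbs sampler through the promoted constructor above (feature factory, number of topics, and the two Dirichlet priors). The document classes and DataAsFeatures are assumed from the rest of the library.

```php
<?php
// Sketch only: TokensDocument/TrainingSet/DataAsFeatures are assumptions; the Lda
// constructor and the train()/getPhi() signatures are the ones shown above.
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Models\Lda;

$trainingSet = new TrainingSet();
$documents = [
    ['cat', 'dog', 'pet', 'dog'],
    ['stock', 'market', 'trade', 'stock'],
    ['dog', 'leash', 'walk', 'pet'],
];
foreach ($documents as $tokens) {
    $trainingSet->addDocument('', new TokensDocument($tokens)); // classes are ignored by LDA
}

$lda = new Lda(new DataAsFeatures(), 2, 1, 1); // 2 topics, a = b = 1
$lda->train($trainingSet, 50);                 // 50 Gibbs sampling iterations

print_r($lda->getPhi(3)); // top 3 words per topic with their probabilities
```
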
diff --git a/src/NlpTools/Models/LinearModel.php b/src/NlpTools/Models/LinearModel.php
index 600b50c..b277357 100644
--- a/src/NlpTools/Models/LinearModel.php
+++ b/src/NlpTools/Models/LinearModel.php
@@ -1,5 +1,7 @@
$l
+ */
+ public function __construct(protected array $l)
{
- $this->l = $l;
}
+
/**
* Get the weight for a given feature
*
* @param string $feature The feature for which the weight will be returned
* @return float The weight
*/
- public function getWeight($feature)
+ public function getWeight(string $feature): float
{
- if (!isset($this->l[$feature])) return 0;
- else return $this->l[$feature];
+ if (!isset($this->l[$feature])) {
+ return 0;
+ }
+
+ return $this->l[$feature];
}
/**
* Get all the weights as an array.
*
- * @return array The weights as an associative array
+ * @return array The weights as an associative array
*/
- public function getWeights()
+ public function getWeights(): array
{
return $this->l;
}
diff --git a/src/NlpTools/Models/Maxent.php b/src/NlpTools/Models/Maxent.php
index 80f9dc1..091df1b 100644
--- a/src/NlpTools/Models/Maxent.php
+++ b/src/NlpTools/Models/Maxent.php
@@ -1,10 +1,13 @@
getClassSet();
+ $classSet = $trainingSet->getClassSet();
- $features = $this->calculateFeatureArray($classSet,$tset,$ff);
- $this->l = $opt->optimize($features);
+ $features = $this->calculateFeatureArray($classSet, $trainingSet, $featureFactory);
+ $this->l = $maxentOptimizer->optimize($features);
}
/**
@@ -44,20 +42,19 @@ public function train(FeatureFactoryInterface $ff, TrainingSet $tset, MaxentOpti
* because we want to be able to optimize externally to
* gain speed (PHP is slow!).
*
- * @param $classes A set of the classes in the training set
- * @param $tset A collection of training documents
- * @param $ff The feature factory
- * @return array An array that contains every feature for every possible class of every document
+     * @param array $classes A set of the classes in the training set
+     * @return array An array that contains every feature for every possible class of every document
*/
- protected function calculateFeatureArray(array $classes, TrainingSet $tset, FeatureFactoryInterface $ff)
+ protected function calculateFeatureArray(array $classes, TrainingSet $trainingSet, FeatureFactoryInterface $featureFactory): array
{
- $features = array();
- $tset->setAsKey(TrainingSet::OFFSET_AS_KEY);
- foreach ($tset as $offset=>$doc) {
- $features[$offset] = array();
+ $features = [];
+ $trainingSet->setAsKey(TrainingSet::OFFSET_AS_KEY);
+ foreach ($trainingSet as $offset => $doc) {
+ $features[$offset] = [];
foreach ($classes as $class) {
- $features[$offset][$class] = $ff->getFeatureArray($class,$doc);
+ $features[$offset][$class] = $featureFactory->getFeatureArray($class, $doc);
}
+
$features[$offset]['__label__'] = $doc->getClass();
}
@@ -69,45 +66,20 @@ protected function calculateFeatureArray(array $classes, TrainingSet $tset, Feat
* $class given a set of possible classes, a feature factory and
* the model's weights l[i]
*
- * @param $classes The set of possible classes
- * @param $ff The feature factory
- * @param $d The document
- * @param string $class A class for which we calculate the probability
- * @return float The probability that document $d belongs to class $class
+     * @param array $classes The set of possible classes
+     * @return float The probability that $document belongs to $class
*/
- public function P(array $classes,FeatureFactoryInterface $ff,DocumentInterface $d,$class)
+ public function calculateProbability(array $classes, FeatureFactoryInterface $featureFactory, DocumentInterface $document, string $class): float
{
- $exps = array();
+ $exps = [];
foreach ($classes as $cl) {
$tmp = 0.0;
- foreach ($ff->getFeatureArray($cl,$d) as $i) {
+ foreach ($featureFactory->getFeatureArray($cl, $document) as $i) {
$tmp += $this->l[$i];
}
+
$exps[$cl] = exp($tmp);
}
- return $exps[$class]/array_sum($exps);
- }
-
- /**
- * Not implemented yet.
- * Simply put:
- * result += log( $this->P(..., ..., ...) ) for every doc in TrainingSet
- *
- * @throws \Exception
- */
- public function CLogLik(TrainingSet $tset,FeatureFactoryInterface $ff)
- {
- throw new \Exception("Unimplemented");
- }
-
- /**
- * Simply print_r weights. Usefull for some kind of debugging when
- * working with small training sets and few features
- */
- public function dumpWeights()
- {
- print_r($this->l);
+ return $exps[$class] / array_sum($exps);
}
-
}
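
A hedged sketch of training the maximum entropy model with the in-PHP gradient descent optimizer. Maxent extends LinearModel, so it is constructed with the initial weights array; the train() signature is assumed to keep its (feature factory, training set, optimizer) parameters, and note the rename from P() to calculateProbability().

```php
<?php
// Sketch only: FunctionFeatures/TokensDocument/TrainingSet are assumptions from the
// rest of the library; class-prefixed tokens are a typical maxent feature choice.
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\FunctionFeatures;
use NlpTools\Models\Maxent;
use NlpTools\Optimizers\MaxentGradientDescent;

$featureFactory = new FunctionFeatures();
$featureFactory->modelPresence();
$featureFactory->add(
    // prefix every token with the class so each (class, token) pair is a distinct feature
    fn (string $class, $doc): array => array_map(
        fn ($token): string => $class . '^' . $token,
        $doc->getDocumentData()
    )
);

$trainingSet = new TrainingSet();
$trainingSet->addDocument('positive', new TokensDocument(['great', 'fun', 'movie']));
$trainingSet->addDocument('negative', new TokensDocument(['boring', 'long', 'movie']));

$model = new Maxent([]); // inherited LinearModel constructor: start with empty weights
$model->train($featureFactory, $trainingSet, new MaxentGradientDescent(0.001, 0.1, 100));

$unseen = new TokensDocument(['great', 'movie']);
echo $model->calculateProbability(['positive', 'negative'], $featureFactory, $unseen, 'positive');
```
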
diff --git a/src/NlpTools/Models/MultinomialNBModelInterface.php b/src/NlpTools/Models/MultinomialNBModelInterface.php
index 149730c..f27b786 100644
--- a/src/NlpTools/Models/MultinomialNBModelInterface.php
+++ b/src/NlpTools/Models/MultinomialNBModelInterface.php
@@ -1,5 +1,7 @@
optimizer = $optimizer;
}
/**
* Open a pipe to the optimizer, send him the data encoded in json
* and then read the stdout to get the results encoded in json
*
- * @param array $feature_array The features that fired for any document for any class @see NlpTools\Models\Maxent
- * @return array The optimized weights
+ * @param array $feature_array The features that fired for any document for any class @see NlpTools\Models\Maxent
+ * @return array The optimized weights
*/
- public function optimize(array &$feature_array)
+ public function optimize(array &$feature_array): array
{
        // where we will read from and where we will write to
- $desrciptorspec = array(
- 0=>array('pipe','r'),
- 1=>array('pipe','w'),
- 2=>STDERR // Should that be redirected to /dev/null or like?
- );
+        $descriptorspec = [0 => ['pipe', 'r'], 1 => ['pipe', 'w'], 2 => STDERR];
// Run the optimizer
- $process = proc_open($this->optimizer,$desrciptorspec,$pipes);
+        $process = proc_open($this->optimizer, $descriptorspec, $pipes);
if (!is_resource($process)) {
- return array();
+ return [];
}
// send the data
- fwrite($pipes[0],json_encode($feature_array));
+ fwrite($pipes[0], json_encode($feature_array));
fclose($pipes[0]);
// get the weights
$json = stream_get_contents($pipes[1]);
// decode as an associative array
- $l = json_decode( $json , true );
+ $l = json_decode($json, true);
// close up the optimizer
fclose($pipes[1]);
@@ -91,5 +85,4 @@ public function optimize(array &$feature_array)
return $l;
}
-
}
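
The optimizer above simply streams the feature array to an external process as JSON on stdin and reads the optimized weights back as JSON from stdout. A hedged sketch of that contract follows; the class name (ExternalMaxentOptimizer) and the binary path are assumptions, and the feature-array shape is the one Maxent::calculateFeatureArray() builds.

```php
<?php
// Sketch only: any program that reads the JSON feature array on stdin and prints a
// JSON map of feature => weight on stdout would satisfy the contract shown above.
use NlpTools\Optimizers\ExternalMaxentOptimizer;

$featureArray = [
    // one entry per training document: features per candidate class plus the true label
    ['spam' => ['spam^buy', 'spam^pills'], 'ham' => ['ham^buy', 'ham^pills'], '__label__' => 'spam'],
    ['spam' => ['spam^meeting'], 'ham' => ['ham^meeting'], '__label__' => 'ham'],
];

$optimizer = new ExternalMaxentOptimizer('/path/to/external-optimizer'); // hypothetical binary
$weights = $optimizer->optimize($featureArray); // associative array of feature => weight
```
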
diff --git a/src/NlpTools/Optimizers/FeatureBasedLinearOptimizerInterface.php b/src/NlpTools/Optimizers/FeatureBasedLinearOptimizerInterface.php
index d307c9d..a46d73c 100644
--- a/src/NlpTools/Optimizers/FeatureBasedLinearOptimizerInterface.php
+++ b/src/NlpTools/Optimizers/FeatureBasedLinearOptimizerInterface.php
@@ -1,5 +1,7 @@
$featureArray
+ * @return array
*/
- public function optimize(array &$feature_array);
+ public function optimize(array &$featureArray): array;
}
diff --git a/src/NlpTools/Optimizers/GradientDescentOptimizer.php b/src/NlpTools/Optimizers/GradientDescentOptimizer.php
index ea7d399..3890db0 100644
--- a/src/NlpTools/Optimizers/GradientDescentOptimizer.php
+++ b/src/NlpTools/Optimizers/GradientDescentOptimizer.php
@@ -1,5 +1,7 @@
+ */
+ protected array $fprimeVector;
// report the improvement
- protected $verbose=2;
+ protected int $verbose = 2;
- public function __construct($precision=0.001, $step=0.1, $maxiter = -1)
+ public function __construct(protected float $precision = 0.001, protected float $step = 0.1, protected int $maxiter = -1)
{
- $this->precision = $precision;
- $this->step = $step;
- $this->maxiter = $maxiter;
}
/**
* Should initialize the weights and compute any constant
* expressions needed for the fprime calculation.
*
- * @param $feature_array All the data known about the training set
- * @param $l The current set of weights to be initialized
- * @return void
+ * @param array $featureArray All the data known about the training set
+ * @param array $l The current set of weights to be initialized
*/
- abstract protected function initParameters(array &$feature_array, array &$l);
+ abstract protected function initParameters(array &$featureArray, array &$l): void;
+
/**
* Should calculate any parameter needed by Fprime that cannot be
* calculated by initParameters because it is not constant.
*
- * @param $feature_array All the data known about the training set
- * @param $l The current set of weights to be initialized
- * @return void
+ * @param array $featureArray All the data known about the training set
+ * @param array $l The current set of weights to be initialized
*/
- abstract protected function prepareFprime(array &$feature_array, array &$l);
+ abstract protected function prepareFprime(array &$featureArray, array &$l): void;
+
/**
* Actually compute the fprime_vector. Set for each $l[$i] the
* value of the partial derivative of f for delta $l[$i]
*
- * @param $feature_array All the data known about the training set
- * @param $l The current set of weights to be initialized
- * @return void
+ * @param array $featureArray All the data known about the training set
+ * @param array $l The current set of weights to be initialized
*/
- abstract protected function Fprime(array &$feature_array, array &$l);
+ abstract protected function fPrime(array &$featureArray, array &$l): void;
/**
* Actually do the gradient descent algorithm.
* l[i] = l[i] - learning_rate*( theta f/delta l[i] ) for each i
     * Could possibly benefit from a vector add/scale function.
*
- * @param $feature_array All the data known about the training set
- * @return array The parameters $l[$i] that minimize F
+ * @param array $featureArray All the data known about the training set
+ * @return array The parameters $l[$i] that minimize F
*/
- public function optimize(array &$feature_array)
+ public function optimize(array &$featureArray): array
{
$itercount = 0;
$optimized = false;
$maxiter = $this->maxiter;
$prec = $this->precision;
$step = $this->step;
- $l = array();
- $this->initParameters($feature_array,$l);
- while (!$optimized && $itercount++!=$maxiter) {
+ $l = [];
+ $this->initParameters($featureArray, $l);
+ while (!$optimized && $itercount++ != $maxiter) {
//$start = microtime(true);
$optimized = true;
- $this->prepareFprime($feature_array,$l);
- $this->Fprime($feature_array,$l);
- foreach ($this->fprime_vector as $i=>$fprime_i_val) {
- $l[$i] -= $step*$fprime_i_val;
+ $this->prepareFprime($featureArray, $l);
+ $this->fPrime($featureArray, $l);
+ foreach ($this->fprimeVector as $i => $fprime_i_val) {
+ $l[$i] -= $step * $fprime_i_val;
if (abs($fprime_i_val) > $prec) {
$optimized = false;
}
}
- //fprintf(STDERR,"%f\n",microtime(true)-$start);
- if ($this->verbose>0)
+
+ if ($this->verbose > 0) {
$this->reportProgress($itercount);
+ }
}
return $l;
}
- public function reportProgress($itercount)
+ public function reportProgress(int $iterCount): void
{
- if ($itercount == 1) {
+ if ($iterCount === 1) {
echo "#\t|Fprime|\n------------------\n";
}
+
$norm = 0;
- foreach ($this->fprime_vector as $fprime_i_val) {
- $norm += $fprime_i_val*$fprime_i_val;
+ foreach ($this->fprimeVector as $fprimeIval) {
+ $norm += $fprimeIval * $fprimeIval;
}
+
$norm = sqrt($norm);
- printf("%d\t%.3f\n",$itercount,$norm);
+ printf("%d\t%.3f\n", $iterCount, $norm);
}
}
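
The optimize() loop above is plain gradient descent: repeat l[i] -= step * f'(l)[i] until every partial derivative is within the precision or maxiter is reached. A self-contained one-dimensional illustration of the same stopping rule (not library code), minimizing f(l) = (l - 3)^2:

```php
<?php
// Standalone illustration of the loop structure used above; f(l) = (l - 3)^2, f'(l) = 2(l - 3).
$precision = 0.001;
$step = 0.1;
$maxiter = -1; // -1 means no iteration cap, as in the class above

$l = 0.0;
$itercount = 0;
$optimized = false;
while (!$optimized && $itercount++ != $maxiter) {
    $fprime = 2 * ($l - 3);
    $l -= $step * $fprime;
    $optimized = abs($fprime) <= $precision; // converged when |f'| is small enough
}

printf("converged to %.4f after %d iterations\n", $l, $itercount);
```
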
diff --git a/src/NlpTools/Optimizers/MaxentGradientDescent.php b/src/NlpTools/Optimizers/MaxentGradientDescent.php
index 4890c29..4b601ac 100644
--- a/src/NlpTools/Optimizers/MaxentGradientDescent.php
+++ b/src/NlpTools/Optimizers/MaxentGradientDescent.php
@@ -1,5 +1,7 @@
+ */
+ protected array $numerators;
+
+ /**
+ * denominators will be computed on each iteration because they
+ * depend on the weights
+ *
+ * @var array
+ */
+ protected array $denominators;
/**
* We initialize all weight for any feature we find to 0. We also
@@ -23,25 +34,28 @@ class MaxentGradientDescent extends GradientDescentOptimizer implements MaxentOp
* the training data (which of course remains constant for a
* specific set of data).
*
- * @param $feature_array All the data known about the training set
- * @param $l The current set of weights to be initialized
- * @return void
+ * @param array $featureArray All the data known about the training set
+ * @param array $l The current set of weights to be initialized
*/
- protected function initParameters(array &$feature_array, array &$l)
+ protected function initParameters(array &$featureArray, array &$l): void
{
- $this->numerators = array();
- $this->fprime_vector = array();
- foreach ($feature_array as $doc) {
- foreach ($doc as $class=>$features) {
- if (!is_array($features)) continue;
- foreach ($features as $fi) {
- $l[$fi] = 0;
- $this->fprime_vector[$fi] = 0;
- if (!isset($this->numerators[$fi])) {
- $this->numerators[$fi] = 0;
+ $this->numerators = [];
+ $this->fprimeVector = [];
+ foreach ($featureArray as $doc) {
+ foreach ($doc as $features) {
+ if (!is_array($features)) {
+ continue;
+ }
+
+ foreach ($features as $feature) {
+ $l[$feature] = 0;
+ $this->fprimeVector[$feature] = 0;
+ if (!isset($this->numerators[$feature])) {
+ $this->numerators[$feature] = 0;
}
}
}
+
foreach ($doc[$doc['__label__']] as $fi) {
$this->numerators[$fi]++;
}
@@ -53,33 +67,41 @@ protected function initParameters(array &$feature_array, array &$l)
* each feature given a set of weights L and a set of features for
* each document for each class.
*
- * @param $feature_array All the data known about the training set
- * @param $l The current set of weights to be initialized
- * @return void
+ * @param array $featureArray All the data known about the training set
+ * @param array $l The current set of weights to be initialized
*/
- protected function prepareFprime(array &$feature_array, array &$l)
+ protected function prepareFprime(array &$featureArray, array &$l): void
{
- $this->denominators = array();
- foreach ($feature_array as $offset=>$doc) {
- $numerator = array_fill_keys(array_keys($doc),0.0);
+ $this->denominators = [];
+ foreach ($featureArray as $doc) {
+ $numerator = array_fill_keys(array_keys($doc), 0.0);
$denominator = 0.0;
- foreach ($doc as $cl=>$f) {
- if (!is_array($f)) continue;
+ foreach ($doc as $cl => $f) {
+ if (!is_array($f)) {
+ continue;
+ }
+
$tmp = 0.0;
foreach ($f as $i) {
$tmp += $l[$i];
}
+
$tmp = exp($tmp);
$numerator[$cl] += $tmp;
$denominator += $tmp;
}
- foreach ($doc as $class=>$features) {
- if (!is_array($features)) continue;
- foreach ($features as $fi) {
- if (!isset($this->denominators[$fi])) {
- $this->denominators[$fi] = 0;
+
+ foreach ($doc as $class => $features) {
+ if (!is_array($features)) {
+ continue;
+ }
+
+ foreach ($features as $feature) {
+ if (!isset($this->denominators[$feature])) {
+ $this->denominators[$feature] = 0;
}
- $this->denominators[$fi] += $numerator[$class]/$denominator;
+
+ $this->denominators[$feature] += $numerator[$class] / $denominator;
}
}
}
@@ -93,15 +115,13 @@ protected function prepareFprime(array &$feature_array, array &$l)
*
* See page 28 of http://nlp.stanford.edu/pubs/maxent-tutorial-slides.pdf
*
- * @param $feature_array All the data known about the training set
- * @param $l The current set of weights to be initialized
- * @return void
+ * @param array $featureArray All the data known about the training set
+ * @param array $l The current set of weights to be initialized
*/
- protected function Fprime(array &$feature_array, array &$l)
+ protected function fPrime(array &$featureArray, array &$l): void
{
- foreach ($this->fprime_vector as $i=>&$fprime_i_val) {
+ foreach ($this->fprimeVector as $i => &$fprime_i_val) {
$fprime_i_val = $this->denominators[$i] - $this->numerators[$i];
}
}
-
}
diff --git a/src/NlpTools/Optimizers/MaxentOptimizerInterface.php b/src/NlpTools/Optimizers/MaxentOptimizerInterface.php
index 626508a..112816b 100644
--- a/src/NlpTools/Optimizers/MaxentOptimizerInterface.php
+++ b/src/NlpTools/Optimizers/MaxentOptimizerInterface.php
@@ -1,8 +1,12 @@
rnd = MersenneTwister::get();
- else
- $this->rnd = $rnd;
+ $this->rnd = $generator ?? MersenneTwister::get();
}
- abstract public function sample();
+ abstract public function sample(): mixed;
}
diff --git a/src/NlpTools/Random/Distributions/Dirichlet.php b/src/NlpTools/Random/Distributions/Dirichlet.php
index 7f5e137..c3df8ca 100644
--- a/src/NlpTools/Random/Distributions/Dirichlet.php
+++ b/src/NlpTools/Random/Distributions/Dirichlet.php
@@ -1,7 +1,10 @@
+ */
+ protected array $gamma;
- public function __construct($a,$k,GeneratorInterface $rnd=null)
+    public function __construct(mixed $a, float $k, ?GeneratorInterface $generator = null)
{
- parent::__construct($rnd);
+ parent::__construct($generator);
$k = (int) abs($k);
if (!is_array($a)) {
- $a = array_fill_keys(range(0,$k-1),$a);
+ $a = array_fill_keys(range(0, $k - 1), $a);
}
- $rnd = $this->rnd;
+ $generator = $this->rnd;
$this->gamma = array_map(
- function ($a) use ($rnd) {
- return new Gamma($a,1,$rnd);
- },
+ fn($a): Gamma => new Gamma($a, 1, $generator),
$a
);
}
- public function sample()
+ /**
+ * @return array
+ */
+ public function sample(): array
{
- $y = array();
+ $y = [];
+ /** @var Gamma $g */
foreach ($this->gamma as $g) {
$y[] = $g->sample();
}
+
$sum = array_sum($y);
return array_map(
- function ($y) use ($sum) {
- return $y/$sum;
- },
+ fn($y): float => $y / $sum,
$y
);
}
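
The sampler above draws k independent Gamma(a, 1) variates and normalizes them, which is the standard way to sample from a Dirichlet. A small usage sketch of the constructor shown above:

```php
<?php
// A symmetric Dirichlet prior with a = 1 over 3 dimensions.
use NlpTools\Random\Distributions\Dirichlet;

$dirichlet = new Dirichlet(1.0, 3);
$probabilities = $dirichlet->sample(); // e.g. [0.21, 0.47, 0.32]

print_r($probabilities);
echo array_sum($probabilities), PHP_EOL; // always ≈ 1.0
```
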
diff --git a/src/NlpTools/Random/Distributions/Gamma.php b/src/NlpTools/Random/Distributions/Gamma.php
index 38f5a0b..9536842 100644
--- a/src/NlpTools/Random/Distributions/Gamma.php
+++ b/src/NlpTools/Random/Distributions/Gamma.php
@@ -1,8 +1,11 @@
scale = $scale;
+    public function __construct(float $shape, protected float $scale, ?GeneratorInterface $generator = null)
+ {
+ parent::__construct($generator);
$this->shape = abs($shape);
- if ($this->shape >= 1)
- $this->normal = new Normal(0,1,$this->rnd);
- else
+ if ($this->shape >= 1) {
+ $this->normal = new Normal(0, 1, $this->rnd);
+ } else {
$this->gamma = new Gamma($this->shape + 1, 1, $this->rnd);
-
+ }
}
- public function sample()
+ public function sample(): ?float
{
if ($this->shape >= 1) {
- $d = $this->shape - 1/3;
- $c = 1/sqrt(9*$d);
+ $d = $this->shape - 1 / 3;
+ $c = 1 / sqrt(9 * $d);
for (;;) {
do {
$x = $this->normal->sample();
- $v = 1 + $c*$x;
+ $v = 1 + $c * $x;
} while ($v <= 0);
- $v = $v*$v*$v;
+
+ $v = $v * $v * $v;
$u = $this->rnd->generate();
- $xsq = $x*$x;
- if ($u < 1-.0331*$xsq*$xsq || log($u) < 0.5*$xsq + $d*(1-$v+log($v)))
- return $this->scale*$d*$v;
+ $xsq = $x * $x;
+ if ($u < 1 - .0331 * $xsq * $xsq || log($u) < 0.5 * $xsq + $d * (1 - $v + log($v))) {
+ return $this->scale * $d * $v;
+ }
}
} else {
$g = $this->gamma->sample();
$w = $this->rnd->generate();
- return $this->scale*$g*pow($w,1/$this->shape);
+ return $this->scale * $g * $w ** (1 / $this->shape);
}
+
+ return null;
}
}
diff --git a/src/NlpTools/Random/Distributions/Normal.php b/src/NlpTools/Random/Distributions/Normal.php
index d3b9f37..d4b011d 100644
--- a/src/NlpTools/Random/Distributions/Normal.php
+++ b/src/NlpTools/Random/Distributions/Normal.php
@@ -1,29 +1,26 @@
m = $m;
+ parent::__construct($generator);
$this->sigma = abs($sigma);
}
- public function sample()
+ public function sample(): float
{
$u1 = $this->rnd->generate();
$u2 = $this->rnd->generate();
- $r = sqrt(-2*log($u1));
- $theta = 2.0*M_PI*$u2;
+ $r = sqrt(-2 * log($u1));
+ $theta = 2.0 * M_PI * $u2;
- return $this->m + $this->sigma*$r*sin($theta);
+ return $this->m + $this->sigma * $r * sin($theta);
}
}
diff --git a/src/NlpTools/Random/Generators/FromFile.php b/src/NlpTools/Random/Generators/FromFile.php
index a585151..bca403f 100644
--- a/src/NlpTools/Random/Generators/FromFile.php
+++ b/src/NlpTools/Random/Generators/FromFile.php
@@ -1,5 +1,7 @@
h = fopen($f,'r');
+ $this->handle = fopen($f, 'r');
}
/**
@@ -29,11 +31,12 @@ public function __construct($f)
*
* @return float A random float in the range (0,1)
*/
- public function generate()
+ public function generate(): float
{
- if (feof($this->h))
- rewind($this->h);
+ if (feof($this->handle)) {
+ rewind($this->handle);
+ }
- return (float) fgets($this->h);
+ return (float) fgets($this->handle);
}
}
diff --git a/src/NlpTools/Random/Generators/GeneratorInterface.php b/src/NlpTools/Random/Generators/GeneratorInterface.php
index ca6774c..4d6fc62 100644
--- a/src/NlpTools/Random/Generators/GeneratorInterface.php
+++ b/src/NlpTools/Random/Generators/GeneratorInterface.php
@@ -1,5 +1,7 @@
1,
- * 'feature_2'=>0.55,
- * 'feature_3'=>12.7,
- * ....
+ * 'feature_1'=>1,
+ * 'feature_2'=>0.55,
+ * 'feature_3'=>12.7,
+ * ....
* )
*/
class CosineSimilarity implements SimilarityInterface, DistanceInterface
{
-
/**
* Returns a number between 0,1 that corresponds to the cos(theta)
* where theta is the angle between the two sets if they are treated
@@ -36,56 +37,62 @@ class CosineSimilarity implements SimilarityInterface, DistanceInterface
* See the class comment about why the number is in [0,1] and not
* in [-1,1] as it normally should.
*
- * @param array $A Either feature vector or simply vector
- * @param array $B Either feature vector or simply vector
+ * @param array $a Either feature vector or simply vector
+ * @param array $b Either feature vector or simply vector
+     * @return float The cosine of the angle between the two vectors
*/
- public function similarity(&$A, &$B)
+ public function similarity(array &$a, array &$b): float
{
-
- if (!is_array($A) || !is_array($B)) {
- throw new \InvalidArgumentException('Vector $' . (!is_array($A) ? 'A' : 'B') . ' is not an array');
- }
-
// This means they are simple text vectors
// so we need to count to make them vectors
- if (is_int(key($A)))
- $v1 = array_count_values($A);
- else
- $v1 = &$A;
- if (is_int(key($B)))
- $v2 = array_count_values($B);
- else
- $v2 = &$B;
+ if (is_int(key($a))) {
+ $v1 = array_count_values($a);
+ } else {
+ $v1 = &$a;
+ }
+
+ if (is_int(key($b))) {
+ $v2 = array_count_values($b);
+ } else {
+ $v2 = &$b;
+ }
$prod = 0.0;
$v1_norm = 0.0;
- foreach ($v1 as $i=>$xi) {
+ foreach ($v1 as $i => $xi) {
if (isset($v2[$i])) {
- $prod += $xi*$v2[$i];
+ $prod += $xi * $v2[$i];
}
- $v1_norm += $xi*$xi;
+
+ $v1_norm += $xi * $xi;
}
+
$v1_norm = sqrt($v1_norm);
- if ($v1_norm==0)
+ if ($v1_norm == 0) {
throw new \InvalidArgumentException("Vector \$A is the zero vector");
+ }
$v2_norm = 0.0;
- foreach ($v2 as $i=>$xi) {
- $v2_norm += $xi*$xi;
+ foreach ($v2 as $xi) {
+ $v2_norm += $xi * $xi;
}
+
$v2_norm = sqrt($v2_norm);
- if ($v2_norm==0)
+ if ($v2_norm == 0) {
throw new \InvalidArgumentException("Vector \$B is the zero vector");
+ }
- return $prod/($v1_norm*$v2_norm);
+ return $prod / ($v1_norm * $v2_norm);
}
/**
* Cosine distance is simply 1-cosine similarity
+ *
+ * @param array $a Either feature vector or simply vector
+ * @param array $b Either feature vector or simply vector
*/
- public function dist(&$A, &$B)
+ public function dist(array &$a, array &$b): float
{
- return 1-$this->similarity($A,$B);
+ return 1 - $this->similarity($a, $b);
}
}
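
A short sketch of the two accepted input forms described in the class comment: plain token arrays (counted internally) and already-weighted feature vectors.

```php
<?php
use NlpTools\Similarity\CosineSimilarity;

$sim = new CosineSimilarity();

// token arrays: the tokens are counted before the angle is computed
$a = ['my', 'name', 'is', 'john'];
$b = ['my', 'name', 'is', 'joe'];
echo $sim->similarity($a, $b), PHP_EOL; // 0.75

// weighted feature vectors: used as-is
$v1 = ['feature_1' => 1, 'feature_2' => 0.55];
$v2 = ['feature_1' => 2, 'feature_2' => 1.1];
echo $sim->similarity($v1, $v2), PHP_EOL; // ≈ 1.0, the vectors are parallel
echo $sim->dist($v1, $v2), PHP_EOL;       // ≈ 0.0
```
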
diff --git a/src/NlpTools/Similarity/DiceSimilarity.php b/src/NlpTools/Similarity/DiceSimilarity.php
index e34e497..a113794 100644
--- a/src/NlpTools/Similarity/DiceSimilarity.php
+++ b/src/NlpTools/Similarity/DiceSimilarity.php
@@ -1,5 +1,7 @@
$a Either feature vector or simply vector
+ * @param array $b Either feature vector or simply vector
+ */
+ public function similarity(array &$a, array &$b): float
{
+ $aa = array_fill_keys($a, 1);
+ $bb = array_fill_keys($b, 1);
+ $intersect = count(array_intersect_key($aa, $bb));
+ $aCount = count($aa);
+ $bCount = count($bb);
- $a = array_fill_keys($A,1);
- $b = array_fill_keys($B,1);
-
- $intersect = count(array_intersect_key($a,$b));
- $a_count = count($a);
- $b_count = count($b);
-
- return (2*$intersect)/($a_count + $b_count);
+ return (2 * $intersect) / ($aCount + $bCount);
}
- public function dist(&$A, &$B)
+ /**
+ * @param array $a Either feature vector or simply vector
+ * @param array $b Either feature vector or simply vector
+ */
+ public function dist(array &$a, array &$b): float
{
- return 1-$this->similarity($A,$B);
+ return 1 - $this->similarity($a, $b);
}
-}
\ No newline at end of file
+}
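
For comparison, the Dice coefficient above works on the distinct tokens of the two inputs: 2·|A∩B| / (|A| + |B|).

```php
<?php
use NlpTools\Similarity\DiceSimilarity;

$dice = new DiceSimilarity();
$a = ['night', 'nacht'];
$b = ['nacht', 'nackt'];
echo $dice->similarity($a, $b), PHP_EOL; // 2 * 1 / (2 + 2) = 0.5
echo $dice->dist($a, $b), PHP_EOL;       // 0.5
```
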
diff --git a/src/NlpTools/Similarity/DistanceInterface.php b/src/NlpTools/Similarity/DistanceInterface.php
index 3aaae28..7c73ab5 100644
--- a/src/NlpTools/Similarity/DistanceInterface.php
+++ b/src/NlpTools/Similarity/DistanceInterface.php
@@ -1,5 +1,7 @@
$a Either feature vector or simply vector
+ * @param array $b Either feature vector or simply vector
+ */
+ public function dist(array &$a, array &$b): float;
}
diff --git a/src/NlpTools/Similarity/Euclidean.php b/src/NlpTools/Similarity/Euclidean.php
index 252faf6..b1fd5bc 100644
--- a/src/NlpTools/Similarity/Euclidean.php
+++ b/src/NlpTools/Similarity/Euclidean.php
@@ -1,5 +1,7 @@
$a Either a vector or a collection of tokens to be transformed to a vector
+ * @param array $b Either a vector or a collection of tokens to be transformed to a vector
* @return float The euclidean distance between $A and $B
*/
- public function dist(&$A, &$B)
+ public function dist(array &$a, array &$b): float
{
- if (is_int(key($A)))
- $v1 = array_count_values($A);
- else
- $v1 = &$A;
- if (is_int(key($B)))
- $v2 = array_count_values($B);
- else
- $v2 = &$B;
+ if (is_int(key($a))) {
+ $v1 = array_count_values($a);
+ } else {
+ $v1 = &$a;
+ }
- $r = array();
- foreach ($v1 as $k=>$v) {
+ if (is_int(key($b))) {
+ $v2 = array_count_values($b);
+ } else {
+ $v2 = &$b;
+ }
+
+ $r = [];
+ foreach ($v1 as $k => $v) {
$r[$k] = $v;
}
- foreach ($v2 as $k=>$v) {
- if (isset($r[$k]))
+
+ foreach ($v2 as $k => $v) {
+ if (isset($r[$k])) {
$r[$k] -= $v;
- else
+ } else {
$r[$k] = $v;
+ }
}
return sqrt(
array_sum(
array_map(
- function ($x) {
- return $x*$x;
- },
+ fn($x): int|float => $x * $x,
$r
)
)
diff --git a/src/NlpTools/Similarity/HammingDistance.php b/src/NlpTools/Similarity/HammingDistance.php
index bf67987..d017791 100644
--- a/src/NlpTools/Similarity/HammingDistance.php
+++ b/src/NlpTools/Similarity/HammingDistance.php
@@ -1,5 +1,7 @@
$a Either a vector or a collection of tokens to be transformed to a vector
+ * @param array $b Either a vector or a collection of tokens to be transformed to a vector
*/
- public function dist(&$A, &$B)
+ public function dist(array &$a, array &$b): float
{
- $l1 = strlen($A);
- $l2 = strlen($B);
- $l = min($l1,$l2);
+ $aa = $a[0];
+ $bb = $b[0];
+
+ $l1 = strlen((string) $aa);
+ $l2 = strlen((string) $bb);
+ $l = min($l1, $l2);
$d = 0;
- for ($i=0;$i<$l;$i++) {
- $d += (int) ($A[$i]!=$B[$i]);
+ for ($i = 0; $i < $l; $i++) {
+ $d += (int) ($aa[$i] !== $bb[$i]);
}
- return $d + (int) abs($l1-$l2);
+ return $d + (int) abs($l1 - $l2);
}
}
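
With the typed array signature above, the two strings are now passed as single-element arrays; the result is the number of differing positions plus the length difference.

```php
<?php
use NlpTools\Similarity\HammingDistance;

$hamming = new HammingDistance();
$a = ['10101'];
$b = ['10011'];
echo $hamming->dist($a, $b); // 2: the third and fourth characters differ
```
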
diff --git a/src/NlpTools/Similarity/JaccardIndex.php b/src/NlpTools/Similarity/JaccardIndex.php
index c97280b..12e5501 100644
--- a/src/NlpTools/Similarity/JaccardIndex.php
+++ b/src/NlpTools/Similarity/JaccardIndex.php
@@ -1,5 +1,7 @@
$a Either a vector or a collection of tokens to be transformed to a vector
+ * @param array $b Either a vector or a collection of tokens to be transformed to a vector
*/
- public function similarity(&$A, &$B)
+ public function similarity(array &$a, array &$b): float
{
- $a = array_fill_keys($A,1);
- $b = array_fill_keys($B,1);
+ $aa = array_fill_keys($a, 1);
+ $bb = array_fill_keys($b, 1);
- $intersect = count(array_intersect_key($a,$b));
- $union = count(array_fill_keys(array_merge($A,$B),1));
+ $intersect = count(array_intersect_key($aa, $bb));
+ $union = count(array_fill_keys(array_merge($a, $b), 1));
- return $intersect/$union;
+ return $intersect / $union;
}
/**
* Jaccard Distance is simply the complement of the jaccard similarity
+ *
+ * @param array $a Either a vector or a collection of tokens to be transformed to a vector
+ * @param array $b Either a vector or a collection of tokens to be transformed to a vector
*/
- public function dist(&$A, &$B)
+ public function dist(array &$a, array &$b): float
{
- return 1-$this->similarity($A,$B);
+ return 1 - $this->similarity($a, $b);
}
-
}
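
And the Jaccard index over distinct tokens, |A∩B| / |A∪B|:

```php
<?php
use NlpTools\Similarity\JaccardIndex;

$jaccard = new JaccardIndex();
$a = ['a', 'b', 'c'];
$b = ['b', 'c', 'd'];
echo $jaccard->similarity($a, $b), PHP_EOL; // 2 / 4 = 0.5
echo $jaccard->dist($a, $b), PHP_EOL;       // 0.5
```
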
diff --git a/src/NlpTools/Similarity/OverlapCoefficient.php b/src/NlpTools/Similarity/OverlapCoefficient.php
index 13ab891..24acb3d 100644
--- a/src/NlpTools/Similarity/OverlapCoefficient.php
+++ b/src/NlpTools/Similarity/OverlapCoefficient.php
@@ -1,5 +1,7 @@
similarity($A,$B);
+ return 1 - $this->similarity($a, $b);
}
}
diff --git a/src/NlpTools/Similarity/Simhash.php b/src/NlpTools/Similarity/Simhash.php
index 2f94729..473eeb2 100644
--- a/src/NlpTools/Similarity/Simhash.php
+++ b/src/NlpTools/Similarity/Simhash.php
@@ -1,5 +1,7 @@
+ */
+ protected static array $search = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'];
- // $h is a hash function that returns a string of 1,0
- // corresponding to the bits of the hash
- protected $h;
+ /**
+ * @var array
+ */
+ protected static array $replace = ['0000', '0001', '0010', '0011', '0100', '0101', '0110', '0111', '1000', '1001', '1010', '1011', '1100', '1101', '1110', '1111'];
- // This is the default hash function used to hash
- // the members of the sets (it is just a wrapper over md5)
- protected static $search = array('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f');
- protected static $replace = array('0000','0001','0010','0011','0100','0101','0110','0111','1000','1001','1010','1011','1100','1101','1110','1111');
- protected static function md5($w)
+ protected static function md5(string $w): string
{
- return str_replace(self::$search,self::$replace,md5($w));
+ return str_replace(self::$search, self::$replace, md5($w));
}
/**
- * @param integer $len The length of the simhash in bits
- * @param callable $hash The hash function to compute the hashes of the features
+     * @param int      $length The length of the simhash in bits
+     * @param callable $h      The hash function used to hash the members of the set
*/
- public function __construct($len,$hash='self::md5')
+ public function __construct(protected int $length, protected $h = [self::class, 'md5'])
{
- $this->length = $len;
- $this->h = $hash;
}
/**
@@ -56,28 +56,32 @@ public function __construct($len,$hash='self::md5')
* 1. Each feature has a weight of 1, but feature duplication is
* allowed.
*
- * @param array $set
+ * @param array $set
* @return string The bits of the hash as a string
- * */
- public function simhash(array &$set)
+ */
+ public function simhash(array &$set): string
{
- $boxes = array_fill(0,$this->length,0);
- if (is_int(key($set)))
+ $boxes = array_fill(0, $this->length, 0);
+ if (is_int(key($set))) {
$dict = array_count_values($set);
- else
+ } else {
$dict = &$set;
- foreach ($dict as $m=>$w) {
- $h = call_user_func($this->h,$m);
- for ($bit_idx=0;$bit_idx<$this->length;$bit_idx++) {
- $boxes[$bit_idx] += ($h[$bit_idx]=='1') ? $w : -$w;
+ }
+
+ foreach ($dict as $m => $w) {
+ $h = call_user_func($this->h, $m);
+ for ($bit_idx = 0; $bit_idx < $this->length; $bit_idx++) {
+ $boxes[$bit_idx] += ($h[$bit_idx] == '1') ? $w : -$w;
}
}
+
$s = '';
foreach ($boxes as $box) {
- if ($box>0)
+ if ($box > 0) {
$s .= '1';
- else
+ } else {
$s .= '0';
+ }
}
return $s;
@@ -86,18 +90,18 @@ public function simhash(array &$set)
/**
* Computes the hamming distance of the simhashes of two sets.
*
- * @param array $A
- * @param array $B
- * @return int [0,$this->length]
+ * @param array $a Either a vector or a collection of tokens to be transformed to a vector
+ * @param array $b Either a vector or a collection of tokens to be transformed to a vector
*/
- public function dist(&$A, &$B)
+ public function dist(array &$a, array &$b): float
{
- $h1 = $this->simhash($A);
- $h2 = $this->simhash($B);
+ $h1 = $this->simhash($a);
+ $h2 = $this->simhash($b);
$d = 0;
- for ($i=0;$i<$this->length;$i++) {
- if ($h1[$i]!=$h2[$i])
+ for ($i = 0; $i < $this->length; $i++) {
+ if ($h1[$i] !== $h2[$i]) {
$d++;
+ }
}
return $d;
@@ -107,13 +111,11 @@ public function dist(&$A, &$B)
* Computes a similarity measure from two sets. The similarity is
* computed as 1 - (sets' distance) / (maximum possible distance).
*
- * @param array $A
- * @param array $B
- * @return float [0,1]
+ * @param array $a Either a vector or a collection of tokens to be transformed to a vector
+ * @param array $b Either a vector or a collection of tokens to be transformed to a vector
*/
- public function similarity(&$A, &$B)
+ public function similarity(array &$a, array &$b): float
{
- return ($this->length-$this->dist($A,$B))/$this->length;
+ return ($this->length - $this->dist($a, $b)) / $this->length;
}
-
}
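
A minimal usage sketch for the refactored Simhash (assuming Composer autoloading). The default hash is the md5 wrapper above, which produces 128 bits, so any length up to 128 works with it:

<?php

require 'vendor/autoload.php';

use NlpTools\Similarity\Simhash;

$sim = new Simhash(64); // 64-bit simhash using the default md5-based hash

$a = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'];
$b = ['the', 'quick', 'brown', 'fox', 'leaps', 'over', 'a', 'lazy', 'dog'];

// Integer-keyed token arrays are run through array_count_values(),
// so the duplicated 'the' contributes a weight of 2.
echo $sim->simhash($a), PHP_EOL;        // a 64-character string of '0'/'1'
echo $sim->dist($a, $b), PHP_EOL;       // hamming distance, in [0, 64]
echo $sim->similarity($a, $b), PHP_EOL; // 1 - dist/64, in [0, 1]
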
diff --git a/src/NlpTools/Similarity/SimilarityInterface.php b/src/NlpTools/Similarity/SimilarityInterface.php
index d63f7f6..3a6c8bf 100644
--- a/src/NlpTools/Similarity/SimilarityInterface.php
+++ b/src/NlpTools/Similarity/SimilarityInterface.php
@@ -1,5 +1,7 @@
$a Either a feature vector or simply a vector
+ * @param array $b Either a feature vector or simply a vector
+ */
+ public function similarity(array &$a, array &$b): float;
}
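
Since every class in this directory now type-hints the same similarity(array &$a, array &$b): float signature, implementations stay interchangeable. A hypothetical helper coded against the interface (mostSimilar() is not part of the library, and OverlapCoefficient is assumed to keep its argument-free constructor):

<?php

require 'vendor/autoload.php';

use NlpTools\Similarity\OverlapCoefficient;
use NlpTools\Similarity\SimilarityInterface;

// Hypothetical helper: index of the candidate token set most similar to the query.
function mostSimilar(SimilarityInterface $sim, array $query, array $candidates): int
{
    $bestIndex = 0;
    $bestScore = -1.0;
    foreach ($candidates as $index => $candidate) {
        $score = $sim->similarity($query, $candidate);
        if ($score > $bestScore) {
            $bestScore = $score;
            $bestIndex = $index;
        }
    }

    return $bestIndex;
}

$query = ['php', 'nlp', 'tools'];
$candidates = [
    ['python', 'nlp', 'library'],
    ['php', 'nlp', 'library'],
    ['java', 'xml', 'parser'],
];

echo mostSimilar(new OverlapCoefficient(), $query, $candidates), PHP_EOL; // 1
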
diff --git a/src/NlpTools/Similarity/TverskyIndex.php b/src/NlpTools/Similarity/TverskyIndex.php
index ccf6b67..7e2232e 100644
--- a/src/NlpTools/Similarity/TverskyIndex.php
+++ b/src/NlpTools/Similarity/TverskyIndex.php
@@ -1,5 +1,7 @@
alpha = $alpha;
- $this->beta = $beta;
}
/**
* Compute the similarity using the alpha and beta values given in the
* constructor.
- *
- * @param array $A
- * @param array $B
- * @return float
*/
- public function similarity(&$A, &$B)
+ public function similarity(array &$a, array &$b): float
{
$alpha = $this->alpha;
$beta = $this->beta;
- $a = array_fill_keys($A,1);
- $b = array_fill_keys($B,1);
+ $aa = array_fill_keys($a, 1);
+ $bb = array_fill_keys($b, 1);
- $min = min(count(array_diff_key($a,$b)),count(array_diff_key($b, $a)));
- $max = max(count(array_diff_key($a,$b)),count(array_diff_key($b, $a)));
+ $min = min(count(array_diff_key($aa, $bb)), count(array_diff_key($bb, $aa)));
+ $max = max(count(array_diff_key($aa, $bb)), count(array_diff_key($bb, $aa)));
- $intersect = count(array_intersect_key($a,$b));
+ $intersect = count(array_intersect_key($aa, $bb));
- return $intersect/($intersect + ($beta * ($alpha * $min + $max*(1-$alpha)) ));
+ return $intersect / ($intersect + ($beta * ($alpha * $min + $max * (1 - $alpha)) ));
}
- public function dist(&$A, &$B)
+ public function dist(array &$a, array &$b): float
{
- return 1-$this->similarity($A,$B);
+ return 1 - $this->similarity($a, $b);
}
}
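
The constructor hunk above is truncated, but the promoted properties suggest it now reads __construct($alpha, $beta); treat that exact signature as an assumption. With alpha = 0.5 and beta = 1 the formula above reduces to the Dice coefficient:

<?php

require 'vendor/autoload.php';

use NlpTools\Similarity\TverskyIndex;

$tversky = new TverskyIndex(0.5, 1);

$a = ['my', 'name', 'is', 'john'];
$b = ['my', 'name', 'is', 'joe'];

// intersect = 3, min = max = 1:
// 3 / (3 + 1 * (0.5 * 1 + 1 * 0.5)) = 0.75 (the Dice value for these sets)
echo $tversky->similarity($a, $b), PHP_EOL; // 0.75
echo $tversky->dist($a, $b), PHP_EOL;       // 0.25
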
diff --git a/src/NlpTools/Stemmers/GreekStemmer.php b/src/NlpTools/Stemmers/GreekStemmer.php
index c2ae22f..9c218de 100644
--- a/src/NlpTools/Stemmers/GreekStemmer.php
+++ b/src/NlpTools/Stemmers/GreekStemmer.php
@@ -1,5 +1,7 @@
"φα",
- "φαγιου"=>"φα",
- "φαγιων"=>"φα",
- "σκαγια"=>"σκα",
- "σκαγιου"=>"σκα",
- "σκαγιων"=>"σκα",
- "ολογιου"=>"ολο",
- "ολογια"=>"ολο",
- "ολογιων"=>"ολο",
- "σογιου"=>"σο",
- "σογια"=>"σο",
- "σογιων"=>"σο",
- "τατογια"=>"τατο",
- "τατογιου"=>"τατο",
- "τατογιων"=>"τατο",
- "κρεασ"=>"κρε",
- "κρεατοσ"=>"κρε",
- "κρεατα"=>"κρε",
- "κρεατων"=>"κρε",
- "περασ"=>"περ",
- "περατοσ"=>"περ",
- "περατα"=>"περ",
- "περατων"=>"περ",
- "τερασ"=>"τερ",
- "τερατοσ"=>"τερ",
- "τερατα"=>"τερ",
- "τερατων"=>"τερ",
- "φωσ"=>"φω",
- "φωτοσ"=>"φω",
- "φωτα"=>"φω",
- "φωτων"=>"φω",
- "καθεστωσ"=>"καθεστ",
- "καθεστωτοσ"=>"καθεστ",
- "καθεστωτα"=>"καθεστ",
- "καθεστωτων"=>"καθεστ",
- "γεγονοσ"=>"γεγον",
- "γεγονοτοσ"=>"γεγον",
- "γεγονοτα"=>"γεγον",
- "γεγονοτων"=>"γεγον"
- );
- protected static $step1regexp="/(.*)(φαγια|φαγιου|φαγιων|σκαγια|σκαγιου|σκαγιων|ολογιου|ολογια|ολογιων|σογιου|σογια|σογιων|τατογια|τατογιου|τατογιων|κρεασ|κρεατοσ|κρεατα|κρεατων|περασ|περατοσ|περατα|περατων|τερασ|τερατοσ|τερατα|τερατων|φωσ|φωτοσ|φωτα|φωτων|καθεστωσ|καθεστωτοσ|καθεστωτα|καθεστωτων|γεγονοσ|γεγονοτοσ|γεγονοτα|γεγονοτων)$/u";
- protected static $v = "[αεηιουω]";
- protected static $v2 = "[αεηιοω]";
-
- public function stem($w)
+ /**
+ * @var array
+ */
+ protected static array $step1list = ["φαγια" => "φα", "φαγιου" => "φα", "φαγιων" => "φα", "σκαγια" => "σκα", "σκαγιου" => "σκα", "σκαγιων" => "σκα", "ολογιου" => "ολο", "ολογια" => "ολο", "ολογιων" => "ολο", "σογιου" => "σο", "σογια" => "σο", "σογιων" => "σο", "τατογια" => "τατο", "τατογιου" => "τατο", "τατογιων" => "τατο", "κρεασ" => "κρε", "κρεατοσ" => "κρε", "κρεατα" => "κρε", "κρεατων" => "κρε", "περασ" => "περ", "περατοσ" => "περ", "περατα" => "περ", "περατων" => "περ", "τερασ" => "τερ", "τερατοσ" => "τερ", "τερατα" => "τερ", "τερατων" => "τερ", "φωσ" => "φω", "φωτοσ" => "φω", "φωτα" => "φω", "φωτων" => "φω", "καθεστωσ" => "καθεστ", "καθεστωτοσ" => "καθεστ", "καθεστωτα" => "καθεστ", "καθεστωτων" => "καθεστ", "γεγονοσ" => "γεγον", "γεγονοτοσ" => "γεγον", "γεγονοτα" => "γεγον", "γεγονοτων" => "γεγον"];
+
+ protected static string $step1regexp = "/(.*)(φαγια|φαγιου|φαγιων|σκαγια|σκαγιου|σκαγιων|ολογιου|ολογια|ολογιων|σογιου|σογια|σογιων|τατογια|τατογιου|τατογιων|κρεασ|κρεατοσ|κρεατα|κρεατων|περασ|περατοσ|περατα|περατων|τερασ|τερατοσ|τερατα|τερατων|φωσ|φωτοσ|φωτα|φωτων|καθεστωσ|καθεστωτοσ|καθεστωτα|καθεστωτων|γεγονοσ|γεγονοτοσ|γεγονοτα|γεγονοτων)$/u";
+
+ protected static string $v = "[αεηιουω]";
+
+ protected static string $v2 = "[αεηιοω]";
+
+ public function stem(string $w): string
{
-$word = $w;
- $stem="";
- $suffix="";
- $firstch="";
+ $stem = "";
+ $suffix = "";
$test1 = true;
@@ -71,10 +37,10 @@ public function stem($w)
}
//step1
- if (preg_match(self::$step1regexp,$w,$fp)) {
+ if (preg_match(self::$step1regexp, $w, $fp)) {
$stem = $fp[1];
$suffix = $fp[2];
- $w = $stem.self::$step1list[$suffix];
+ $w = $stem . self::$step1list[$suffix];
$test1 = false;
}
@@ -82,58 +48,58 @@ public function stem($w)
$re2 = "/^(.+?)(εδεσ|εδων)$/u";
$re3 = "/^(.+?)(ουδεσ|ουδων)$/u";
$re4 = "/^(.+?)(εωσ|εων)$/u";
- if (preg_match($re1,$w,$fp)) { // step 2a
+ if (preg_match($re1, $w, $fp)) { // step 2a
$stem = $fp[1];
$w = $stem;
$re = "/(οκ|μαμ|μαν|μπαμπ|πατερ|γιαγι|νταντ|κυρ|θει|πεθερ)$/u";
- if (!preg_match($re,$w)) {
+ if (preg_match($re, $w) !== 1) {
$w .= "αδ";
}
- } elseif (preg_match($re2,$w,$fp)) { //step 2b
+ } elseif (preg_match($re2, $w, $fp)) { //step 2b
$stem = $fp[1];
$w = $stem;
$exept2 = "/(οπ|ιπ|εμπ|υπ|γηπ|δαπ|κρασπ|μιλ)$/u";
- if (preg_match($exept2,$w)) {
+ if (preg_match($exept2, $w)) {
$w .= "εδ";
}
- } elseif (preg_match($re3,$w,$fp)) { //step 2c
+ } elseif (preg_match($re3, $w, $fp)) { //step 2c
$stem = $fp[1];
$w = $stem;
$exept3 = "/(αρκ|καλιακ|πεταλ|λιχ|πλεξ|σκ|σ|φλ|φρ|βελ|λουλ|χν|σπ|τραγ|φε)$/u";
- if (preg_match($exept3,$w)) {
+ if (preg_match($exept3, $w)) {
$w .= "ουδ";
}
- } elseif (preg_match($re4,$w,$fp)) { //step 2d
+ } elseif (preg_match($re4, $w, $fp)) { //step 2d
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept4 = "/^(θ|δ|ελ|γαλ|ν|π|ιδ|παρ)$/u";
- if (preg_match($exept4,$w)) {
+ if (preg_match($exept4, $w)) {
$w .= "ε";
}
}
//step 3
$re = "/^(.+?)(ια|ιου|ιων)$/u";
- if (preg_match($re,$w,$fp)) {
+ if (preg_match($re, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
- $re = "/".self::$v."$/u";
+ $re = "/" . self::$v . "$/u";
$test1 = false;
- if (preg_match($re,$w)) {
- $w = $stem."ι";
+ if (preg_match($re, $w)) {
+ $w = $stem . "ι";
}
}
//step 4
$re = "/^(.+?)(ικα|ικο|ικου|ικων)$/u";
- if (preg_match($re,$w,$fp)) {
+ if (preg_match($re, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
$test1 = false;
- $re = "/".self::$v."$/u";
+ $re = "/" . self::$v . "$/u";
$exept5 = "/^(αλ|αδ|ενδ|αμαν|αμμοχαλ|ηθ|ανηθ|αντιδ|φυσ|βρωμ|γερ|εξωδ|καλπ|καλλιν|καταδ|μουλ|μπαν|μπαγιατ|μπολ|μποσ|νιτ|ξικ|συνομηλ|πετσ|πιτσ|πικαντ|πλιατσ|ποστελν|πρωτοδ|σερτ|συναδ|τσαμ|υποδ|φιλον|φυλοδ|χασ)$/u";
- if (preg_match($re,$w) || preg_match($exept5,$w)) {
+ if (preg_match($re, $w) || preg_match($exept5, $w)) {
$w .= "ικ";
}
}
@@ -162,123 +128,123 @@ public function stem($w)
return "αγαμ";
}
- if (preg_match($re2,$w,$fp)) {
+ if (preg_match($re2, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
$test1 = false;
- } elseif (preg_match($re,$w,$fp)) {
+ } elseif (preg_match($re, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept6 = "/^(αναπ|αποθ|αποκ|αποστ|βουβ|ξεθ|ουλ|πεθ|πικρ|ποτ|σιχ|χ)$/u";
- if (preg_match($exept6,$w)) {
+ if (preg_match($exept6, $w)) {
$w .= "αμ";
}
- } elseif (preg_match($re4,$w,$fp)) { //step 5b
+ } elseif (preg_match($re4, (string) $w, $fp)) { //step 5b
$stem = $fp[1];
$w = $stem;
$test1 = false;
$re4 = "/^(τρ|τσ)$/u";
- if (preg_match($re4,$w)) {
+ if (preg_match($re4, $w)) {
$w .= "αγαν";
}
- } elseif (preg_match($re3,$w,$fp)) {
+ } elseif (preg_match($re3, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
$test1 = false;
- $re3 = "/".self::$v2."$/u";
+ $re3 = "/" . self::$v2 . "$/u";
$exept7 = "/^(βετερ|βουλκ|βραχμ|γ|δραδουμ|θ|καλπουζ|καστελ|κορμορ|λαοπλ|μωαμεθ|μ|μουσουλμ|ν|ουλ|π|πελεκ|πλ|πολισ|πορτολ|σαρακατσ|σουλτ|τσαρλατ|ορφ|τσιγγ|τσοπ|φωτοστεφ|χ|ψυχοπλ|αγ|ορφ|γαλ|γερ|δεκ|διπλ|αμερικαν|ουρ|πιθ|πουριτ|σ|ζωντ|ικ|καστ|κοπ|λιχ|λουθηρ|μαιντ|μελ|σιγ|σπ|στεγ|τραγ|τσαγ|φ|ερ|αδαπ|αθιγγ|αμηχ|ανικ|ανοργ|απηγ|απιθ|ατσιγγ|βασ|βασκ|βαθυγαλ|βιομηχ|βραχυκ|διατ|διαφ|ενοργ|θυσ|καπνοβιομηχ|καταγαλ|κλιβ|κοιλαρφ|λιβ|μεγλοβιομηχ|μικροβιομηχ|νταβ|ξηροκλιβ|ολιγοδαμ|ολογαλ|πενταρφ|περηφ|περιτρ|πλατ|πολυδαπ|πολυμηχ|στεφ|ταβ|τετ|υπερηφ|υποκοπ|χαμηλοδαπ|ψηλοταβ)$/u";
- if (preg_match($re3,$w) || preg_match($exept7,$w)) {
+ if (preg_match($re3, $w) || preg_match($exept7, $w)) {
$w .= "αν";
}
- } elseif (preg_match($re6,$w,$fp)) { //step 5c
+ } elseif (preg_match($re6, (string) $w, $fp)) { //step 5c
$stem = $fp[1];
$w = $stem;
$test1 = false;
- } elseif (preg_match($re5,$w,$fp)) {
+ } elseif (preg_match($re5, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
$test1 = false;
-// $re5 = $this->v2."$";
- $re5 = self::$v2."";
+ $re5 = self::$v2 . "";
$exept8 = "/(οδ|αιρ|φορ|ταθ|διαθ|σχ|ενδ|ευρ|τιθ|υπερθ|ραθ|ενθ|ροθ|σθ|πυρ|αιν|συνδ|συν|συνθ|χωρ|πον|βρ|καθ|ευθ|εκθ|νετ|ρον|αρκ|βαρ|βολ|ωφελ)$/u";
$exept9 = "/^(αβαρ|βεν|εναρ|αβρ|αδ|αθ|αν|απλ|βαρον|ντρ|σκ|κοπ|μπορ|νιφ|παγ|παρακαλ|σερπ|σκελ|συρφ|τοκ|υ|δ|εμ|θαρρ|θ)$/u";
- if (preg_match($re5,$w) || preg_match($exept8,$w)) {
+ if (preg_match($re5, $w) || preg_match($exept8, $w)) {
$w .= "ετ";
} elseif (preg_match($exept9, $w)) {
$w .= "ετ";
}
- } elseif (preg_match($re7,$w,$fp)) { //step 5d
+ } elseif (preg_match($re7, (string) $w, $fp)) { //step 5d
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept10 = "/^(αρχ)$/u";
$exept11 = "/(κρε)$/u";
- if (preg_match($exept10,$w)) {
+ if (preg_match($exept10, $w)) {
$w .= "οντ";
}
- if (preg_match($exept11,$w)) {
+
+ if (preg_match($exept11, $w)) {
$w .= "ωντ";
}
- } elseif (preg_match($re8,$w,$fp)) { //step 5e
+ } elseif (preg_match($re8, (string) $w, $fp)) { //step 5e
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept11 = "/^(ον)$/u";
- if (preg_match($exept11,$w)) {
+ if (preg_match($exept11, $w)) {
$w .= "ομαστ";
}
- } elseif (preg_match($re10,$w,$fp)) { //step 5f
+ } elseif (preg_match($re10, (string) $w, $fp)) { //step 5f
$stem = $fp[1];
$w = $stem;
$test1 = false;
$re10 = "/^(π|απ|συμπ|ασυμπ|ακαταπ|αμεταμφ)$/u";
- if (preg_match($re10,$w)) {
- $w .= "ιεστ";
+ if (preg_match($re10, $w)) {
+ $w .= "ιεστ";
}
- } elseif (preg_match($re9,$w,$fp)) {
+ } elseif (preg_match($re9, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept12 = "/^(αλ|αρ|εκτελ|ζ|μ|ξ|παρακαλ|αρ|προ|νισ)$/u";
- if (preg_match($exept12,$w)) {
+ if (preg_match($exept12, $w)) {
$w .= "εστ";
}
- } elseif (preg_match($re12,$w,$fp)) { //step 5g
+ } elseif (preg_match($re12, (string) $w, $fp)) { //step 5g
$stem = $fp[1];
$w = $stem;
$test1 = false;
- } elseif (preg_match($re11,$w,$fp)) {
+ } elseif (preg_match($re11, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept13 = "/(σκωλ|σκουλ|ναρθ|σφ|οθ|πιθ)$/u";
$exept14 = "/^(διαθ|θ|παρακαταθ|προσθ|συνθ|)$/u";
- if (preg_match($exept13,$w)) {
+ if (preg_match($exept13, $w)) {
$w .= "ηκ";
- } elseif (preg_match($exept14,$w)) {
+ } elseif (preg_match($exept14, $w)) {
$w .= "ηκ";
}
- } elseif (preg_match($re13,$w,$fp)) { //step 5h
+ } elseif (preg_match($re13, (string) $w, $fp)) { //step 5h
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept15 = "/^(φαρμακ|χαδ|αγκ|αναρρ|βρομ|εκλιπ|λαμπιδ|λεχ|μ|πατ|ρ|λ|μεδ|μεσαζ|υποτειν|αμ|αιθ|ανηκ|δεσποζ|ενδιαφερ|δε|δευτερευ|καθαρευ|πλε|τσα)$/u";
$exept16 = "/(ποδαρ|βλεπ|πανταχ|φρυδ|μαντιλ|μαλλ|κυματ|λαχ|ληγ|φαγ|ομ|πρωτ)$/u";
- if (preg_match($exept15,$w)) {
+ if (preg_match($exept15, $w)) {
$w .= "ουσ";
- } elseif (preg_match($exept16,$w)) {
+ } elseif (preg_match($exept16, $w)) {
$w .= "ουσ";
}
- } elseif (preg_match($re14,$w,$fp)) { //step 5i
+ } elseif (preg_match($re14, (string) $w, $fp)) { //step 5i
$stem = $fp[1];
$w = $stem;
$test1 = false;
@@ -288,44 +254,46 @@ public function stem($w)
$exept18 = "/^(αβαστ|πολυφ|αδηφ|παμφ|ρ|ασπ|αφ|αμαλ|αμαλλι|ανυστ|απερ|ασπαρ|αχαρ|δερβεν|δροσοπ|ξεφ|νεοπ|νομοτ|ολοπ|ομοτ|προστ|προσωποπ|συμπ|συντ|τ|υποτ|χαρ|αειπ|αιμοστ|ανυπ|αποτ|αρτιπ|διατ|εν|επιτ|κροκαλοπ|σιδηροπ|λ|ναυ|ουλαμ|ουρ|π|τρ|μ)$/u";
$exept19 = "/(οφ|πελ|χορτ|λλ|σφ|ρπ|φρ|πρ|λοχ|σμην)$/u";
- if((preg_match($exept18,$w) || preg_match($exept19,$w))
- && !(preg_match($exept17,$w) || preg_match($exept20,$w))) {
- $w .= "αγ";
+ if (
+ (preg_match($exept18, $w) || preg_match($exept19, $w))
+ && preg_match($exept17, $w) !== 1 && preg_match($exept20, $w) !== 1
+ ) {
+ $w .= "αγ";
}
- } elseif (preg_match($re15,$w,$fp)) { //step 5j
+ } elseif (preg_match($re15, (string) $w, $fp)) { //step 5j
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept21 = "/^(ν|χερσον|δωδεκαν|ερημον|μεγαλον|επταν)$/u";
- if (preg_match($exept21,$w)) {
+ if (preg_match($exept21, $w)) {
$w .= "ησ";
}
- } elseif (preg_match($re16,$w,$fp)) { //step 5k
+ } elseif (preg_match($re16, (string) $w, $fp)) { //step 5k
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept22 = "/^(ασβ|σβ|αχρ|χρ|απλ|αειμν|δυσχρ|ευχρ|κοινοχρ|παλιμψ)$/u";
- if (preg_match($exept22,$w)) {
+ if (preg_match($exept22, $w)) {
$w .= "ηστ";
}
- } elseif (preg_match($re17,$w,$fp)) { //step 5l
+ } elseif (preg_match($re17, (string) $w, $fp)) { //step 5l
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept23 = "/^(ν|ρ|σπι|στραβομουτσ|κακομουτσ|εξων)$/u";
- if (preg_match($exept23,$w)) {
+ if (preg_match($exept23, $w)) {
$w .= "ουν";
}
- } elseif (preg_match($re18,$w,$fp)) { //step 5l
+ } elseif (preg_match($re18, (string) $w, $fp)) { //step 5l
$stem = $fp[1];
$w = $stem;
$test1 = false;
$exept24 = "/^(παρασουσ|φ|χ|ωριοπλ|αζ|αλλοσουσ|ασουσ)$/u";
- if (preg_match($exept24,$w)) {
+ if (preg_match($exept24, $w)) {
$w .= "ουμ";
}
}
@@ -333,23 +301,23 @@ public function stem($w)
// step 6
$re = "/^(.+?)(ματα|ματων|ματοσ)$/u";
$re2 = "/^(.+?)(α|αγατε|αγαν|αει|αμαι|αν|ασ|ασαι|αται|αω|ε|ει|εισ|ειτε|εσαι|εσ|εται|ι|ιεμαι|ιεμαστε|ιεται|ιεσαι|ιεσαστε|ιομασταν|ιομουν|ιομουνα|ιονταν|ιοντουσαν|ιοσασταν|ιοσαστε|ιοσουν|ιοσουνα|ιοταν|ιουμα|ιουμαστε|ιουνται|ιουνταν|η|ηδεσ|ηδων|ηθει|ηθεισ|ηθειτε|ηθηκατε|ηθηκαν|ηθουν|ηθω|ηκατε|ηκαν|ησ|ησαν|ησατε|ησει|ησεσ|ησουν|ησω|ο|οι|ομαι|ομασταν|ομουν|ομουνα|ονται|ονταν|οντουσαν|οσ|οσασταν|οσαστε|οσουν|οσουνα|οταν|ου|ουμαι|ουμαστε|ουν|ουνται|ουνταν|ουσ|ουσαν|ουσατε|υ|υσ|ω|ων)$/u";
- if (preg_match($re,$w,$fp)) {
+ if (preg_match($re, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem . "μα";
}
- if (preg_match($re2,$w,$fp) && $test1) {
+
+ if (preg_match($re2, (string) $w, $fp) && $test1) {
$stem = $fp[1];
$w = $stem;
}
// step 7
$re = "/^(.+?)(εστερ|εστατ|οτερ|οτατ|υτερ|υτατ|ωτερ|ωτατ)$/u";
- if (preg_match($re,$w,$fp)) {
+ if (preg_match($re, (string) $w, $fp)) {
$stem = $fp[1];
$w = $stem;
}
return $w;
}
-
}
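
A short usage sketch for the stemmer. The suffix tables above are lower-case, unaccented and write the final sigma as 'σ', so the input is assumed to be normalized the same way before stem() is called:

<?php

require 'vendor/autoload.php';

use NlpTools\Stemmers\GreekStemmer;

$stemmer = new GreekStemmer();

// Both words appear in the step-1 exception table above,
// which maps them to "καθεστ" and "γεγον" respectively.
foreach (['καθεστωτοσ', 'γεγονοτων'] as $word) {
    echo $word, ' => ', $stemmer->stem($word), PHP_EOL;
}
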
diff --git a/src/NlpTools/Stemmers/LancasterStemmer.php b/src/NlpTools/Stemmers/LancasterStemmer.php
index f9a2af5..7f8b985 100644
--- a/src/NlpTools/Stemmers/LancasterStemmer.php
+++ b/src/NlpTools/Stemmers/LancasterStemmer.php
@@ -1,6 +1,11 @@
*/
- protected $indexedRules = array();
+ protected array $indexedRules = [];
/**
* Used to check for vowels
- * @var VowelAbstractFactory
*/
- protected $vowelChecker = null;
+ protected VowelsAbstractFactory $vowelChecker;
/**
* Constructor loads the ruleset into memory
- * @param array $ruleSet the set of rules that will be used by the lancaster algorithm. if empty
+ * @param array $ruleSet The set of rules that will be used by the Lancaster algorithm. If empty,
* this will use the default ruleset embedded in the LancasterStemmer
*/
- public function __construct($ruleSet = array())
+ public function __construct(array $ruleSet = [])
{
//setup the default rule set
- if (empty($ruleSet)) {
+ if ($ruleSet === []) {
$ruleSet = LancasterStemmer::getDefaultRuleSet();
}
$this->indexRules($ruleSet);
- //only get the english vowel checker
+
$this->vowelChecker = VowelsAbstractFactory::factory("English");
}
/**
* Creates an chained hashtable using the lookup char as the key
- * @param array $rules
+ *
+ * @param array $rules
*/
- protected function indexRules(array $rules)
+ protected function indexRules(array $rules): void
{
- $this->indexedRules = array();
-
+ $this->indexedRules = [];
foreach ($rules as $rule) {
if (isset($this->indexedRules[$rule[self::LOOKUP_CHAR]])) {
$this->indexedRules[$rule[self::LOOKUP_CHAR]][] = $rule;
} else {
- $this->indexedRules[$rule[self::LOOKUP_CHAR]] = array($rule);
+ $this->indexedRules[$rule[self::LOOKUP_CHAR]] = [$rule];
}
}
}
@@ -76,18 +85,19 @@ protected function indexRules(array $rules)
* @param string $word The word that gets stemmed
* @return string The stemmed word
*/
- public function stem($word)
+ public function stem(string $word): string
{
$this->originalToken = $word;
// account for the case of the string being empty
- if (empty($word))
+ if ($word === '' || $word === '0') {
return $word;
+ }
//only iterate out loop if a rule is applied
do {
$ruleApplied = false;
- $lookupChar = $word[strlen($word)-1];
+ $lookupChar = $word[strlen($word) - 1];
//check that the last character is in the index, if not return the origin token
if (!array_key_exists($lookupChar, $this->indexedRules)) {
@@ -95,27 +105,30 @@ public function stem($word)
}
foreach ($this->indexedRules[$lookupChar] as $rule) {
- if(strrpos($word, substr($rule[self::ENDING_STRING],-1)) ===
- (strlen($word)-strlen($rule[self::ENDING_STRING]))){
-
+ if (
+ strrpos($word, substr((string) $rule[self::ENDING_STRING], -1)) ===
+ (strlen($word) - strlen((string) $rule[self::ENDING_STRING]))
+ ) {
if (!empty($rule[self::INTACT_FLAG])) {
-
- if($this->originalToken == $word &&
- $this->isAcceptable($word, (int) $rule[self::REMOVE_TOTAL])){
-
- $word = $this->applyRule($word, $rule);
- $ruleApplied = true;
+ if (
+ $this->originalToken === $word &&
+ $this->isAcceptable($word, (int) $rule[self::REMOVE_TOTAL])
+ ) {
+ $word = $this->applyRule($word, $rule);
+ $ruleApplied = true;
if ($rule[self::CONTINUE_FLAG] === '.') {
return $word;
}
+
break;
}
} elseif ($this->isAcceptable($word, (int) $rule[self::REMOVE_TOTAL])) {
$word = $this->applyRule($word, $rule);
$ruleApplied = true;
if ($rule[self::CONTINUE_FLAG] === '.') {
- return $word;
+ return $word;
}
+
break;
}
} else {
@@ -125,15 +138,14 @@ public function stem($word)
} while ($ruleApplied);
return $word;
-
}
/**
* Apply the lancaster rule and return the altered string.
* @param string $word word the rule is being applied on
- * @param array $rule An associative array containing all the data elements for applying to the word
+ * @param array $rule An associative array containing all the data elements needed to apply the rule to the word
*/
- protected function applyRule($word, $rule)
+ protected function applyRule(string $word, array $rule): string
{
return substr_replace($word, $rule[self::APPEND_STRING], strlen($word) - $rule[self::REMOVE_TOTAL]);
}
@@ -144,832 +156,24 @@ protected function applyRule($word, $rule)
* @param int $removeTotal The number of characters to remove from the suffix
* @return boolean True is the word is acceptable
*/
- protected function isAcceptable($word, $removeTotal)
+ protected function isAcceptable(string $word, int $removeTotal): bool
{
$length = strlen($word) - $removeTotal;
- if ($this->vowelChecker->isVowel($word, 0)&& $length >= 2) {
- return true;
- } elseif($length >= 3 &&
- ($this->vowelChecker->isVowel($word, 1) || $this->vowelChecker->isVowel($word, 2))) {
+ if ($this->vowelChecker->isVowel($word, 0) && $length >= 2) {
return true;
}
- return false;
+ return $length >= 3 &&
+ ($this->vowelChecker->isVowel($word, 1) || $this->vowelChecker->isVowel($word, 2));
}
/**
* Contains an array with the default lancaster rules
- * @return array
+ *
+ * @return array
*/
- public static function getDefaultRuleSet()
+ public static function getDefaultRuleSet(): array
{
- return array(
- array(
- "lookup_char"=> "a",
- "ending_string"=> "ai",
- "intact_flag"=> "*",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "a",
- "ending_string"=> "a",
- "intact_flag"=> "*",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "b",
- "ending_string"=> "bb",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "c",
- "ending_string"=> "city",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "s",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "c",
- "ending_string"=> "ci",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "c",
- "ending_string"=> "cn",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "t",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "d",
- "ending_string"=> "dd",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "d",
- "ending_string"=> "dei",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "y",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "d",
- "ending_string"=> "deec",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "ss",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "d",
- "ending_string"=> "dee",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "d",
- "ending_string"=> "de",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "d",
- "ending_string"=> "dooh",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "e",
- "ending_string"=> "e",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "f",
- "ending_string"=> "feil",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "v",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "f",
- "ending_string"=> "fi",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "g",
- "ending_string"=> "gni",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "g",
- "ending_string"=> "gai",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "y",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "g",
- "ending_string"=> "ga",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "g",
- "ending_string"=> "gg",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "h",
- "ending_string"=> "ht",
- "intact_flag"=> "*",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "h",
- "ending_string"=> "hsiug",
- "intact_flag"=> "",
- "remove_total"=> "5",
- "append_string"=> "ct",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "h",
- "ending_string"=> "hsi",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "i",
- "ending_string"=> "i",
- "intact_flag"=> "*",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "i",
- "ending_string"=> "i",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "y",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "ji",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "d",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "juf",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "s",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "ju",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "d",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "jo",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "d",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "jeh",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "r",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "jrev",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "t",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "jsim",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "t",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "jn",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "d",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "j",
- "ending_string"=> "j",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "s",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lbaifi",
- "intact_flag"=> "",
- "remove_total"=> "6",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lbai",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "y",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lba",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lbi",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lib",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "l",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lc",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lufi",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "y",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "luf",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lu",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lai",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "lau",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "la",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "l",
- "ending_string"=> "ll",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "m",
- "ending_string"=> "mui",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "m",
- "ending_string"=> "mu",
- "intact_flag"=> "*",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "m",
- "ending_string"=> "msi",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "m",
- "ending_string"=> "mm",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "n",
- "ending_string"=> "nois",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "j",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "n",
- "ending_string"=> "noix",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "ct",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "n",
- "ending_string"=> "noi",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "n",
- "ending_string"=> "nai",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "n",
- "ending_string"=> "na",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "n",
- "ending_string"=> "nee",
- "intact_flag"=> "",
- "remove_total"=> "0",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "n",
- "ending_string"=> "ne",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "n",
- "ending_string"=> "nn",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "p",
- "ending_string"=> "pihs",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "p",
- "ending_string"=> "pp",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "r",
- "ending_string"=> "re",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "r",
- "ending_string"=> "rae",
- "intact_flag"=> "",
- "remove_total"=> "0",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "r",
- "ending_string"=> "ra",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "r",
- "ending_string"=> "ro",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "r",
- "ending_string"=> "ru",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "r",
- "ending_string"=> "rr",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "r",
- "ending_string"=> "rt",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "r",
- "ending_string"=> "rei",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "y",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "sei",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "y",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "sis",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "si",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "ssen",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "ss",
- "intact_flag"=> "",
- "remove_total"=> "0",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "suo",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "su",
- "intact_flag"=> "*",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "s",
- "intact_flag"=> "*",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "s",
- "ending_string"=> "s",
- "intact_flag"=> "",
- "remove_total"=> "0",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tacilp",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "y",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "ta",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tnem",
- "intact_flag"=> "",
- "remove_total"=> "4",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tne",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tna",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tpir",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "b",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tpro",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "b",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tcud",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tpmus",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tpec",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "iv",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tulo",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "v",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tsis",
- "intact_flag"=> "",
- "remove_total"=> "0",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tsi",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "t",
- "ending_string"=> "tt",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "u",
- "ending_string"=> "uqi",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "u",
- "ending_string"=> "ugo",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "v",
- "ending_string"=> "vis",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "j",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "v",
- "ending_string"=> "vie",
- "intact_flag"=> "",
- "remove_total"=> "0",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "v",
- "ending_string"=> "vi",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "ylb",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yli",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "y",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "ylp",
- "intact_flag"=> "",
- "remove_total"=> "0",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yl",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "ygo",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yhp",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "ymo",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "ypo",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yti",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yte",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "ytl",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yrtsi",
- "intact_flag"=> "",
- "remove_total"=> "5",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yra",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yro",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yfi",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> "."),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "ycn",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "t",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "y",
- "ending_string"=> "yca",
- "intact_flag"=> "",
- "remove_total"=> "3",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "z",
- "ending_string"=> "zi",
- "intact_flag"=> "",
- "remove_total"=> "2",
- "append_string"=> "",
- "continue_flag"=> ">"),
- array(
- "lookup_char"=> "z",
- "ending_string"=> "zy",
- "intact_flag"=> "",
- "remove_total"=> "1",
- "append_string"=> "s",
- "continue_flag"=> ".")
- );
+ return [["lookup_char" => "a", "ending_string" => "ai", "intact_flag" => "*", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "a", "ending_string" => "a", "intact_flag" => "*", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "b", "ending_string" => "bb", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "c", "ending_string" => "city", "intact_flag" => "", "remove_total" => "3", "append_string" => "s", "continue_flag" => "."], ["lookup_char" => "c", "ending_string" => "ci", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "c", "ending_string" => "cn", "intact_flag" => "", "remove_total" => "1", "append_string" => "t", "continue_flag" => ">"], ["lookup_char" => "d", "ending_string" => "dd", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "d", "ending_string" => "dei", "intact_flag" => "", "remove_total" => "3", "append_string" => "y", "continue_flag" => ">"], ["lookup_char" => "d", "ending_string" => "deec", "intact_flag" => "", "remove_total" => "2", "append_string" => "ss", "continue_flag" => "."], ["lookup_char" => "d", "ending_string" => "dee", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "d", "ending_string" => "de", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "d", "ending_string" => "dooh", "intact_flag" => "", "remove_total" => "4", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "e", "ending_string" => "e", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "f", "ending_string" => "feil", "intact_flag" => "", "remove_total" => "1", "append_string" => "v", "continue_flag" => "."], ["lookup_char" => "f", "ending_string" => "fi", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "g", "ending_string" => "gni", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "g", "ending_string" => "gai", "intact_flag" => "", "remove_total" => "3", "append_string" => "y", "continue_flag" => "."], ["lookup_char" => "g", "ending_string" => "ga", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "g", "ending_string" => "gg", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "h", "ending_string" => "ht", "intact_flag" => "*", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "h", "ending_string" => "hsiug", "intact_flag" => "", "remove_total" => "5", "append_string" => "ct", "continue_flag" => "."], ["lookup_char" => "h", "ending_string" => "hsi", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "i", "ending_string" => "i", "intact_flag" => "*", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "i", "ending_string" => "i", "intact_flag" => "", "remove_total" => "1", "append_string" => "y", "continue_flag" => ">"], ["lookup_char" => "j", "ending_string" => "ji", "intact_flag" => "", "remove_total" => "1", "append_string" => "d", "continue_flag" => "."], ["lookup_char" => 
"j", "ending_string" => "juf", "intact_flag" => "", "remove_total" => "1", "append_string" => "s", "continue_flag" => "."], ["lookup_char" => "j", "ending_string" => "ju", "intact_flag" => "", "remove_total" => "1", "append_string" => "d", "continue_flag" => "."], ["lookup_char" => "j", "ending_string" => "jo", "intact_flag" => "", "remove_total" => "1", "append_string" => "d", "continue_flag" => "."], ["lookup_char" => "j", "ending_string" => "jeh", "intact_flag" => "", "remove_total" => "1", "append_string" => "r", "continue_flag" => "."], ["lookup_char" => "j", "ending_string" => "jrev", "intact_flag" => "", "remove_total" => "1", "append_string" => "t", "continue_flag" => "."], ["lookup_char" => "j", "ending_string" => "jsim", "intact_flag" => "", "remove_total" => "2", "append_string" => "t", "continue_flag" => "."], ["lookup_char" => "j", "ending_string" => "jn", "intact_flag" => "", "remove_total" => "1", "append_string" => "d", "continue_flag" => "."], ["lookup_char" => "j", "ending_string" => "j", "intact_flag" => "", "remove_total" => "1", "append_string" => "s", "continue_flag" => "."], ["lookup_char" => "l", "ending_string" => "lbaifi", "intact_flag" => "", "remove_total" => "6", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "l", "ending_string" => "lbai", "intact_flag" => "", "remove_total" => "4", "append_string" => "y", "continue_flag" => "."], ["lookup_char" => "l", "ending_string" => "lba", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "l", "ending_string" => "lbi", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "l", "ending_string" => "lib", "intact_flag" => "", "remove_total" => "2", "append_string" => "l", "continue_flag" => ">"], ["lookup_char" => "l", "ending_string" => "lc", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "l", "ending_string" => "lufi", "intact_flag" => "", "remove_total" => "4", "append_string" => "y", "continue_flag" => "."], ["lookup_char" => "l", "ending_string" => "luf", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "l", "ending_string" => "lu", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "l", "ending_string" => "lai", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "l", "ending_string" => "lau", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "l", "ending_string" => "la", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "l", "ending_string" => "ll", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "m", "ending_string" => "mui", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "m", "ending_string" => "mu", "intact_flag" => "*", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "m", "ending_string" => "msi", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "m", "ending_string" => "mm", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "n", "ending_string" 
=> "nois", "intact_flag" => "", "remove_total" => "4", "append_string" => "j", "continue_flag" => ">"], ["lookup_char" => "n", "ending_string" => "noix", "intact_flag" => "", "remove_total" => "4", "append_string" => "ct", "continue_flag" => "."], ["lookup_char" => "n", "ending_string" => "noi", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "n", "ending_string" => "nai", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "n", "ending_string" => "na", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "n", "ending_string" => "nee", "intact_flag" => "", "remove_total" => "0", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "n", "ending_string" => "ne", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "n", "ending_string" => "nn", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "p", "ending_string" => "pihs", "intact_flag" => "", "remove_total" => "4", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "p", "ending_string" => "pp", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "r", "ending_string" => "re", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "r", "ending_string" => "rae", "intact_flag" => "", "remove_total" => "0", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "r", "ending_string" => "ra", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "r", "ending_string" => "ro", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "r", "ending_string" => "ru", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "r", "ending_string" => "rr", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "r", "ending_string" => "rt", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "r", "ending_string" => "rei", "intact_flag" => "", "remove_total" => "3", "append_string" => "y", "continue_flag" => ">"], ["lookup_char" => "s", "ending_string" => "sei", "intact_flag" => "", "remove_total" => "3", "append_string" => "y", "continue_flag" => ">"], ["lookup_char" => "s", "ending_string" => "sis", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "s", "ending_string" => "si", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "s", "ending_string" => "ssen", "intact_flag" => "", "remove_total" => "4", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "s", "ending_string" => "ss", "intact_flag" => "", "remove_total" => "0", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "s", "ending_string" => "suo", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "s", "ending_string" => "su", "intact_flag" => "*", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "s", "ending_string" => "s", "intact_flag" => "*", 
"remove_total" => "1", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "s", "ending_string" => "s", "intact_flag" => "", "remove_total" => "0", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "tacilp", "intact_flag" => "", "remove_total" => "4", "append_string" => "y", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "ta", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "t", "ending_string" => "tnem", "intact_flag" => "", "remove_total" => "4", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "t", "ending_string" => "tne", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "t", "ending_string" => "tna", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "t", "ending_string" => "tpir", "intact_flag" => "", "remove_total" => "2", "append_string" => "b", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "tpro", "intact_flag" => "", "remove_total" => "2", "append_string" => "b", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "tcud", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "tpmus", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "tpec", "intact_flag" => "", "remove_total" => "2", "append_string" => "iv", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "tulo", "intact_flag" => "", "remove_total" => "2", "append_string" => "v", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "tsis", "intact_flag" => "", "remove_total" => "0", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "t", "ending_string" => "tsi", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "t", "ending_string" => "tt", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "u", "ending_string" => "uqi", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "u", "ending_string" => "ugo", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "v", "ending_string" => "vis", "intact_flag" => "", "remove_total" => "3", "append_string" => "j", "continue_flag" => ">"], ["lookup_char" => "v", "ending_string" => "vie", "intact_flag" => "", "remove_total" => "0", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "v", "ending_string" => "vi", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "ylb", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "yli", "intact_flag" => "", "remove_total" => "3", "append_string" => "y", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "ylp", "intact_flag" => "", "remove_total" => "0", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "y", "ending_string" => "yl", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "ygo", "intact_flag" => "", "remove_total" 
=> "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "y", "ending_string" => "yhp", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "y", "ending_string" => "ymo", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "y", "ending_string" => "ypo", "intact_flag" => "", "remove_total" => "1", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "y", "ending_string" => "yti", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "yte", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "ytl", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "y", "ending_string" => "yrtsi", "intact_flag" => "", "remove_total" => "5", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "y", "ending_string" => "yra", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "yro", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "yfi", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => "."], ["lookup_char" => "y", "ending_string" => "ycn", "intact_flag" => "", "remove_total" => "2", "append_string" => "t", "continue_flag" => ">"], ["lookup_char" => "y", "ending_string" => "yca", "intact_flag" => "", "remove_total" => "3", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "z", "ending_string" => "zi", "intact_flag" => "", "remove_total" => "2", "append_string" => "", "continue_flag" => ">"], ["lookup_char" => "z", "ending_string" => "zy", "intact_flag" => "", "remove_total" => "1", "append_string" => "s", "continue_flag" => "."]];
}
-
}
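
A usage sketch for the Lancaster (Paice/Husk) stemmer with the default rules and with a custom rule set (assuming Composer autoloading). Note that ending_string is stored reversed, so "gni" matches the suffix "ing":

<?php

require 'vendor/autoload.php';

use NlpTools\Stemmers\LancasterStemmer;

$stemmer = new LancasterStemmer();
echo $stemmer->stem('maximum'), PHP_EOL; // "maxim", via the intact "um" rule

// A custom rule set uses the same keys as getDefaultRuleSet().
$onlyIng = new LancasterStemmer([
    [
        'lookup_char'   => 'g',
        'ending_string' => 'gni', // "ing", reversed
        'intact_flag'   => '',
        'remove_total'  => '3',
        'append_string' => '',
        'continue_flag' => '.',   // stop once this rule has applied
    ],
]);

echo $onlyIng->stem('running'), PHP_EOL; // "runn"
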
diff --git a/src/NlpTools/Stemmers/PorterStemmer.php b/src/NlpTools/Stemmers/PorterStemmer.php
index 2b38bef..c474283 100644
--- a/src/NlpTools/Stemmers/PorterStemmer.php
+++ b/src/NlpTools/Stemmers/PorterStemmer.php
@@ -1,5 +1,7 @@
'a','e'=>'e','i'=>'i','o'=>'o','u'=>'u');
+ /**
+ * isset() is faster than switch in PHP, even for single-character switches
+ *
+ * @var array
+ */
+ protected static array $vowels = ['a' => 'a', 'e' => 'e', 'i' => 'i', 'o' => 'o', 'u' => 'u'];
/**
* Quoting from the original C implementation.
*
- * > The main part of the stemming algorithm starts here. b is a buffer
- * > holding the word to be stemmed. The letters are in b[k0], b[k0+1] ...
- * > ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
- * > downwards as the stemming progresses. Zero termination is not in fact
- * > used in the algorithm.
- * >
- * > Note that only lower case sequences are stemmed. Forcing to lower case
- * > should be done before stem(...) is called.
+ * > The main part of the stemming algorithm starts here. b is a buffer
+ * > holding the word to be stemmed. The letters are in b[k0], b[k0+1] ...
+ * > ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
+ * > downwards as the stemming progresses. Zero termination is not in fact
+ * > used in the algorithm.
+ * >
+ * > Note that only lower case sequences are stemmed. Forcing to lower case
+ * > should be done before stem(...) is called.
*
* $b is a string holding one lower case word. $k0 is always 0 in
* our case so it is removed. $k is readjusted to point to the end
@@ -42,23 +48,29 @@ class PorterStemmer extends Stemmer
* the stem.
*
*/
- private $b;
- private $k,$j;
+ private string $b;
+
+ private int $k;
+
+ private int $j;
/* cons(i) is TRUE <=> b[i] is a consonant. */
- protected function cons($i)
+ protected function cons(int $i): bool
{
- if ($i>$this->k) {
+ if ($i > $this->k) {
return true;
}
+
$c = $this->b[$i];
if (isset(self::$vowels[$c])) {
return false;
- } elseif ($c==='y') {
- return ($i===0) ? true : !$this->cons($i-1);
- } else {
- return true;
}
+
+ if ($c === 'y') {
+ return ($i === 0) ? true : !$this->cons($i - 1);
+ }
+
+ return true;
}
/*
@@ -72,57 +84,80 @@ protected function cons($i)
* vcvcvc gives 3
* ....
* */
- protected function m()
+ protected function m(): ?int
{
$n = 0;
$i = 0;
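+        // skip any leading consonants, then count each vowel-to-consonant transition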
while (true) {
- if ($i > $this->j)
+ if ($i > $this->j) {
return $n;
- if (! $this->cons($i))
+ }
+
+ if (!$this->cons($i)) {
break;
+ }
+
$i++;
}
+
$i++;
while (true) {
while (true) {
- if ($i > $this->j)
+ if ($i > $this->j) {
return $n;
- if ($this->cons($i))
+ }
+
+ if ($this->cons($i)) {
break;
+ }
+
$i++;
}
+
$i++;
$n++;
while (true) {
- if ($i > $this->j)
+ if ($i > $this->j) {
return $n;
- if (! $this->cons($i))
+ }
+
+ if (!$this->cons($i)) {
break;
+ }
+
$i++;
}
+
$i++;
}
+
+ // @phpstan-ignore-next-line
+ return null;
}
/* vowelinstem() is TRUE <=> 0,...j contains a vowel */
- protected function vowelinstem()
+ protected function vowelinstem(): bool
{
for ($i = 0; $i <= $this->j; $i++) {
- if (! $this->cons($i))
+ if (!$this->cons($i)) {
return true;
+ }
}
return false;
}
/* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */
- protected function doublec($j)
+ protected function doublec(int $j): bool
{
- if ($j < 1)
+ if ($j < 1) {
return false;
- if ($this->b[$j] != $this->b[$j-1])
+ }
+
+ if ($this->b[$j] !== $this->b[$j - 1]) {
return false;
+ }
+
return $this->cons($j);
}
@@ -135,32 +170,37 @@ protected function doublec($j)
* snow, box, tray.
*
* */
- protected function cvc($i)
+ protected function cvc(int $i): bool
{
- if ($i < 2 || !$this->cons($i) || $this->cons($i-1) || !$this->cons($i-2))
- return false;
- $ch = $this->b[$i];
- if ($ch === 'w' || $ch === 'x' || $ch === 'y')
+ if ($i < 2 || !$this->cons($i) || $this->cons($i - 1) || !$this->cons($i - 2)) {
return false;
+ }
- return true;
+ $ch = $this->b[$i];
+ return !($ch === 'w' || $ch === 'x' || $ch === 'y');
}
/*
* ends(s) is TRUE <=> 0...k ends with the string s.
*
* $length is passed as a parameter because it provides a speedup.
- * */
- protected function ends($s,$length)
+ *
+ */
+ protected function ends(string $s, int $length): bool
{
- if ($s[$length-1] != $this->b[$this->k])
+ if ($s[$length - 1] !== $this->b[$this->k]) {
return false;
- if ($length >= $this->k+1)
+ }
+
+ if ($length >= $this->k + 1) {
return false;
- if (substr_compare($this->b,$s,$this->k-$length+1,$length)!=0)
+ }
+
+ if (substr_compare($this->b, $s, $this->k - $length + 1, $length) !== 0) {
return false;
+ }
- $this->j = $this->k-$length;
+ $this->j = $this->k - $length;
return true;
}
@@ -171,16 +211,17 @@ protected function ends($s,$length)
*
* Again $length is passed for speedup
* */
- protected function setto($s,$length)
+ protected function setto(string $s, int $length): void
{
- $this->b = substr_replace($this->b,$s,$this->j+1);
- $this->k = $this->j+$length;
+ $this->b = substr_replace($this->b, $s, $this->j + 1);
+ $this->k = $this->j + $length;
}
- protected function r($s,$length)
+ protected function r(string $s, int $length): void
{
- if ($this->m()>0)
- $this->setto($s,$length);
+ if ($this->m() > 0) {
+ $this->setto($s, $length);
+ }
}
/*
@@ -205,34 +246,38 @@ protected function r($s,$length)
* meetings -> meet
*
* */
- protected function step1ab()
+ protected function step1ab(): void
{
if ($this->b[$this->k] === 's') {
- if ($this->ends("sses",4))
+ if ($this->ends("sses", 4)) {
$this->k -= 2;
- else if ($this->ends("ies",3))
- $this->setto("i",1);
- else if ($this->b[$this->k-1] !== 's')
+ } elseif ($this->ends("ies", 3)) {
+ $this->setto("i", 1);
+ } elseif ($this->b[$this->k - 1] !== 's') {
$this->k--;
+ }
}
- if ($this->ends("eed",3)) {
- if ($this->m() > 0)
+
+ if ($this->ends("eed", 3)) {
+ if ($this->m() > 0) {
$this->k--;
- } elseif (($this->ends("ed",2) || $this->ends("ing",3)) && $this->vowelinstem()) {
+ }
+ } elseif (($this->ends("ed", 2) || $this->ends("ing", 3)) && $this->vowelinstem()) {
$this->k = $this->j;
- if ($this->ends("at",2))
- $this->setto("ate",3);
- else if ($this->ends("bl",2))
- $this->setto("ble",3);
- else if ($this->ends("iz",2))
- $this->setto("ize",3);
- else if ($this->doublec($this->k)) {
+ if ($this->ends("at", 2)) {
+ $this->setto("ate", 3);
+ } elseif ($this->ends("bl", 2)) {
+ $this->setto("ble", 3);
+ } elseif ($this->ends("iz", 2)) {
+ $this->setto("ize", 3);
+ } elseif ($this->doublec($this->k)) {
$this->k--;
$ch = $this->b[$this->k];
- if ($ch === 'l' || $ch === 's' || $ch === 'z')
+ if ($ch === 'l' || $ch === 's' || $ch === 'z') {
$this->k++;
+ }
} elseif ($this->m() === 1 && $this->cvc($this->k)) {
- $this->setto("e",1);
+ $this->setto("e", 1);
}
}
}
@@ -242,10 +287,11 @@ protected function step1ab()
* vowel in the stem.
*
* */
- protected function step1c()
+ protected function step1c(): void
{
- if ($this->ends("y",1) && $this->vowelinstem())
+ if ($this->ends("y", 1) && $this->vowelinstem()) {
$this->b[$this->k] = 'i';
+ }
}
/*
@@ -254,48 +300,131 @@ protected function step1c()
* before the suffix must give m() > 0.
*
* */
- protected function step2()
+ protected function step2(): void
{
- switch ($this->b[$this->k-1]) {
+ switch ($this->b[$this->k - 1]) {
case 'a':
- if ($this->ends("ational",7)) { $this->r("ate",3); break; }
- if ($this->ends("tional",6)) { $this->r("tion",4); break; }
+ if ($this->ends("ational", 7)) {
+ $this->r("ate", 3);
+ break;
+ }
+
+ if ($this->ends("tional", 6)) {
+ $this->r("tion", 4);
+ break;
+ }
+
break;
case 'c':
- if ($this->ends("enci",4)) { $this->r("ence",4); break; }
- if ($this->ends("anci",4)) { $this->r("ance",4); break; }
+ if ($this->ends("enci", 4)) {
+ $this->r("ence", 4);
+ break;
+ }
+
+ if ($this->ends("anci", 4)) {
+ $this->r("ance", 4);
+ break;
+ }
+
break;
case 'e':
- if ($this->ends("izer",4)) { $this->r("ize",3); break; }
+ if ($this->ends("izer", 4)) {
+ $this->r("ize", 3);
+ break;
+ }
+
break;
case 'l':
- if ($this->ends("bli",3)) { $this->r("ble",3); break; }
+ if ($this->ends("bli", 3)) {
+ $this->r("ble", 3);
+ break;
+ }
+
// -DEPARTURE-
// To match the published algorithm, replace the above line with
// if ($this->ends("abli",4)) { $this->r("able",4); break; }
- if ($this->ends("alli",4)) { $this->r("al",2); break; }
- if ($this->ends("entli",5)) { $this->r("ent",3); break; }
- if ($this->ends("eli",3)) { $this->r("e",1); break; }
- if ($this->ends("ousli",5)) { $this->r("ous",3); break; }
+ if ($this->ends("alli", 4)) {
+ $this->r("al", 2);
+ break;
+ }
+
+ if ($this->ends("entli", 5)) {
+ $this->r("ent", 3);
+ break;
+ }
+
+ if ($this->ends("eli", 3)) {
+ $this->r("e", 1);
+ break;
+ }
+
+ if ($this->ends("ousli", 5)) {
+ $this->r("ous", 3);
+ break;
+ }
+
break;
case 'o':
- if ($this->ends("ization",7)) { $this->r("ize",3); break; }
- if ($this->ends("ation",5)) { $this->r("ate",3); break; }
- if ($this->ends("ator",4)) { $this->r("ate",3); break; }
+ if ($this->ends("ization", 7)) {
+ $this->r("ize", 3);
+ break;
+ }
+
+ if ($this->ends("ation", 5)) {
+ $this->r("ate", 3);
+ break;
+ }
+
+ if ($this->ends("ator", 4)) {
+ $this->r("ate", 3);
+ break;
+ }
+
break;
case 's':
- if ($this->ends("alism",5)) { $this->r("al",2); break; }
- if ($this->ends("iveness",7)) { $this->r("ive",3); break; }
- if ($this->ends("fulness",7)) { $this->r("ful",3); break; }
- if ($this->ends("ousness",7)) { $this->r("ous",3); break; }
+ if ($this->ends("alism", 5)) {
+ $this->r("al", 2);
+ break;
+ }
+
+ if ($this->ends("iveness", 7)) {
+ $this->r("ive", 3);
+ break;
+ }
+
+ if ($this->ends("fulness", 7)) {
+ $this->r("ful", 3);
+ break;
+ }
+
+ if ($this->ends("ousness", 7)) {
+ $this->r("ous", 3);
+ break;
+ }
+
break;
case 't':
- if ($this->ends("aliti",5)) { $this->r("al",2); break; }
- if ($this->ends("iviti",5)) { $this->r("ive",3); break; }
- if ($this->ends("biliti",6)) { $this->r("ble",3); break; }
+ if ($this->ends("aliti", 5)) {
+ $this->r("al", 2);
+ break;
+ }
+
+ if ($this->ends("iviti", 5)) {
+ $this->r("ive", 3);
+ break;
+ }
+
+ if ($this->ends("biliti", 6)) {
+ $this->r("ble", 3);
+ break;
+ }
+
break;
case 'g':
- if ($this->ends("logi",4)) { $this->r("log",3); break; }
+ if ($this->ends("logi", 4)) {
+ $this->r("log", 3);
+ break;
+ }
// -DEPARTURE-
// To match the published algorithm delete the above line
}
@@ -306,110 +435,163 @@ protected function step2()
* to step2.
*
* */
- protected function step3()
+ protected function step3(): void
{
switch ($this->b[$this->k]) {
case 'e':
- if ($this->ends("icate",5)) { $this->r("ic",2); break; }
- if ($this->ends("ative",5)) { $this->r("",0); break; }
- if ($this->ends("alize",5)) { $this->r("al",2); break; }
+ if ($this->ends("icate", 5)) {
+ $this->r("ic", 2);
+ break;
+ }
+
+ if ($this->ends("ative", 5)) {
+ $this->r("", 0);
+ break;
+ }
+
+ if ($this->ends("alize", 5)) {
+ $this->r("al", 2);
+ break;
+ }
+
break;
case 'i':
- if ($this->ends("iciti",5)) { $this->r("ic",2); break; }
+ if ($this->ends("iciti", 5)) {
+ $this->r("ic", 2);
+ break;
+ }
+
break;
case 'l':
- if ($this->ends("ical",4)) { $this->r("ic",2); break; }
- if ($this->ends("ful",3)) { $this->r("",0); break; }
+ if ($this->ends("ical", 4)) {
+ $this->r("ic", 2);
+ break;
+ }
+
+ if ($this->ends("ful", 3)) {
+ $this->r("", 0);
+ break;
+ }
+
break;
case 's':
- if ($this->ends("ness",4)) { $this->r("",0); break; }
+ if ($this->ends("ness", 4)) {
+ $this->r("", 0);
+ break;
+ }
+
break;
}
}
/* step4() takes off -ant, -ence etc., in context vcvc. */
- protected function step4()
+ protected function step4(): void
{
- switch ($this->b[$this->k-1]) {
+ switch ($this->b[$this->k - 1]) {
case 'a':
- if ($this->ends("al",2))
+ if ($this->ends("al", 2)) {
break;
+ }
return;
case 'c':
- if ($this->ends("ance",4))
+ if ($this->ends("ance", 4)) {
break;
- if ($this->ends("ence",4))
+ }
+
+ if ($this->ends("ence", 4)) {
break;
+ }
return;
case 'e':
- if ($this->ends("er",2))
+ if ($this->ends("er", 2)) {
break;
+ }
return;
case 'i':
- if ($this->ends("ic",2))
+ if ($this->ends("ic", 2)) {
break;
+ }
return;
case 'l':
- if ($this->ends("able",4))
+ if ($this->ends("able", 4)) {
break;
- if ($this->ends("ible",4))
+ }
+
+ if ($this->ends("ible", 4)) {
break;
+ }
return;
case 'n':
- if ($this->ends("ant",3))
+ if ($this->ends("ant", 3)) {
break;
- if ($this->ends("ement",5))
+ }
+
+ if ($this->ends("ement", 5)) {
break;
- if ($this->ends("ment",4))
+ }
+
+ if ($this->ends("ment", 4)) {
break;
- if ($this->ends("ent",3))
+ }
+
+ if ($this->ends("ent", 3)) {
break;
+ }
return;
case 'o':
- if ($this->ends("ion",3) && ($this->b[$this->j] === 's' || $this->b[$this->j] === 't'))
+ if ($this->ends("ion", 3) && ($this->b[$this->j] === 's' || $this->b[$this->j] === 't')) {
break;
- if ($this->ends("ou",2))
+ }
+
+ if ($this->ends("ou", 2)) {
break;
+ }
return;
/* takes care of -ous */
case 's':
- if ($this->ends("ism",3))
+ if ($this->ends("ism", 3)) {
break;
+ }
return;
case 't':
- if ($this->ends("ate",3))
+ if ($this->ends("ate", 3)) {
break;
- if ($this->ends("iti",3))
+ }
+
+ if ($this->ends("iti", 3)) {
break;
+ }
return;
case 'u':
- if ($this->ends("ous",3))
+ if ($this->ends("ous", 3)) {
break;
+ }
return;
case 'v':
- if ($this->ends("ive",3))
+ if ($this->ends("ive", 3)) {
break;
+ }
return;
case 'z':
- if ($this->ends("ize",3))
+ if ($this->ends("ize", 3)) {
break;
+ }
return;
default:
return;
}
-        if ($this->m() > 1) $this->k = $this->j;
+        if ($this->m() > 1) {
+            $this->k = $this->j;
+        }
}
/*
@@ -417,30 +599,33 @@ protected function step4()
* changes -ll to -l if m() > 1.
*
* */
- protected function step5()
+ protected function step5(): void
{
$this->j = $this->k;
if ($this->b[$this->k] === 'e') {
$a = $this->m();
- if ($a > 1 || $a == 1 && !$this->cvc($this->k-1))
+            if ($a > 1 || ($a === 1 && !$this->cvc($this->k - 1))) {
$this->k--;
+ }
}
- if ($this->b[$this->k] === 'l' && $this->doublec($this->k) && $this->m() > 1)
+
+ if ($this->b[$this->k] === 'l' && $this->doublec($this->k) && $this->m() > 1) {
$this->k--;
+ }
}
/**
* The word must be a lower case one byte per character string (in
* English).
- *
*/
- public function stem($word)
+ public function stem(string $word): string
{
- $this->j=0;
+ $this->j = 0;
$this->b = $word;
- $this->k = strlen($word)-1;
- if ($this->k<=1)
+ $this->k = strlen($word) - 1;
+ if ($this->k <= 1) {
return $word;
+ }
$this->step1ab();
$this->step1c();
@@ -449,6 +634,6 @@ public function stem($word)
$this->step4();
$this->step5();
- return substr($this->b,0,$this->k+1);
+ return substr($this->b, 0, $this->k + 1);
}
}
diff --git a/src/NlpTools/Stemmers/RegexStemmer.php b/src/NlpTools/Stemmers/RegexStemmer.php
index 36c2c66..e643e30 100644
--- a/src/NlpTools/Stemmers/RegexStemmer.php
+++ b/src/NlpTools/Stemmers/RegexStemmer.php
@@ -1,5 +1,7 @@
regex = $regexstr;
- $this->min = $min;
}
- public function stem($word)
+ public function stem(string $word): string
{
- if (mb_strlen($word,'utf-8')>=$this->min)
- return preg_replace($this->regex,'',$word);
+ if (mb_strlen($word, 'utf-8') >= $this->min) {
+ return preg_replace($this->regex, '', $word);
+ }
+
return $word;
}
-
}
diff --git a/src/NlpTools/Stemmers/Stemmer.php b/src/NlpTools/Stemmers/Stemmer.php
index e1560fa..fa86f83 100644
--- a/src/NlpTools/Stemmers/Stemmer.php
+++ b/src/NlpTools/Stemmers/Stemmer.php
@@ -1,5 +1,7 @@
$tokens
+ * @return array
*/
- public function stemAll(array $tokens)
+ public function stemAll(array $tokens): array
{
- return array_map(array($this,'stem'),$tokens);
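+        // first-class callable syntax, available since PHP 8.1, replaces the array-style callable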
+ return array_map($this->stem(...), $tokens);
}
/**
* A stemmer's transformation is simply the replacing of a word
* with its stem.
*/
- public function transform($word)
+ public function transform(string $word): ?string
{
return $this->stem($word);
}
diff --git a/src/NlpTools/Tokenizers/ClassifierBasedTokenizer.php b/src/NlpTools/Tokenizers/ClassifierBasedTokenizer.php
index 3bf4cc8..cfaa401 100644
--- a/src/NlpTools/Tokenizers/ClassifierBasedTokenizer.php
+++ b/src/NlpTools/Tokenizers/ClassifierBasedTokenizer.php
@@ -1,9 +1,12 @@
+ */
+ protected static array $classSet = ['O', 'EOW'];
- // used when joining the tokens into one
- protected $sep;
+ // initial tokenizer
+ protected TokenizerInterface $tok;
- public function __construct(ClassifierInterface $cls, TokenizerInterface $tok=null,$sep=' ')
+ public function __construct(protected ClassifierInterface $classifier, ?TokenizerInterface $tokenizer = null, protected string $sep = ' ')
{
- if ($tok == null) {
- $this->tok = new WhitespaceAndPunctuationTokenizer();
- } else {
- $this->tok = $tok;
- }
- $this->classifier = $cls;
- $this->sep = $sep;
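+        // fall back to a simple whitespace-and-punctuation tokenizer when none is supplied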
+        $this->tok = $tokenizer ?? new WhitespaceAndPunctuationTokenizer();
}
/**
@@ -72,32 +68,32 @@ public function __construct(ClassifierInterface $cls, TokenizerInterface $tok=nu
* 3. For each token that is not an EOW add it to the next EOW token using a separator
*
* @param string $str The character sequence to be broken in tokens
- * @return array The token array
+ * @return array The token array
*/
- public function tokenize($str)
+ public function tokenize(string $str): array
{
// split the string in tokens and create documents to be
// classified
$tokens = $this->tok->tokenize($str);
- $docs = array();
- foreach ($tokens as $offset=>$tok) {
- $docs[] = new WordDocument($tokens,$offset,5);
+ $docs = [];
+ foreach (array_keys($tokens) as $offset) {
+ $docs[] = new WordDocument($tokens, $offset, 5);
}
// classify each token as an EOW or O
- $tags = array();
+ $tags = [];
foreach ($docs as $doc) {
$tags[] = $this->classifier->classify(self::$classSet, $doc);
}
// merge O and EOW into real tokens
- $realtokens = array();
- $currentToken = array();
- foreach ($tokens as $offset=>$tok) {
+ $realtokens = [];
+ $currentToken = [];
+ foreach ($tokens as $offset => $tok) {
$currentToken[] = $tok;
- if ($tags[$offset] == self::EOW) {
- $realtokens[] = implode($this->sep,$currentToken);
- $currentToken = array();
+ if ($tags[$offset] === self::EOW) {
+ $realtokens[] = implode($this->sep, $currentToken);
+ $currentToken = [];
}
}
diff --git a/src/NlpTools/Tokenizers/PennTreeBankTokenizer.php b/src/NlpTools/Tokenizers/PennTreeBankTokenizer.php
index 0d9e33b..7514533 100644
--- a/src/NlpTools/Tokenizers/PennTreeBankTokenizer.php
+++ b/src/NlpTools/Tokenizers/PennTreeBankTokenizer.php
@@ -1,6 +1,9 @@
An array that holds the patterns and replacements
*/
- protected $patternsAndReplacements = array();
+ protected array $patternsAndReplacements = [];
public function __construct()
{
@@ -25,83 +27,81 @@ public function __construct()
/**
* Calls internal functions to handle data processing
- * @param string $str
+ *
+ * @return array
*/
- public function tokenize($str)
+ public function tokenize(string $str): array
{
return parent::tokenize($this->execute($str));
}
+
/**
* Handles the data processing
* @param string $string The raw text to get parsed
*/
- protected function execute($string)
+ protected function execute(string $string): string
{
foreach ($this->patternsAndReplacements as $patternAndReplacement) {
- $tmp = preg_replace("/".$patternAndReplacement->pattern."/s", $patternAndReplacement->replacement, $string);
+ $tmp = preg_replace("/" . $patternAndReplacement->pattern . "/s", $patternAndReplacement->replacement, $string);
if ($tmp === null) {
InvalidExpression::invalidRegex($patternAndReplacement->pattern, $patternAndReplacement->replacement);
} else {
$string = $tmp;
}
}
-
+
return $string;
}
/**
* Initializes the patterns and replacements/
*/
- protected function initPatternReplacement()
+ protected function initPatternReplacement(): void
{
$this->addPatternAndReplacement('^"', '``');
- $this->addPatternAndReplacement("\([ ([{<]\)","$1 `` ");
- $this->addPatternAndReplacement("\.\.\."," ... ");
+ $this->addPatternAndReplacement("\([ ([{<]\)", "$1 `` ");
+ $this->addPatternAndReplacement("\.\.\.", " ... ");
$this->addPatternAndReplacement("([,;:@#$%&])", " $1 ");
- $this->addPatternAndReplacement("([^.])([.])([])}>\"\']*)[ ]*$","\${1} \${2}\${3}");
- $this->addPatternAndReplacement("[?!]"," $0 ");
- $this->addPatternAndReplacement("[][(){}<>]"," $0 ");
- $this->addPatternAndReplacement("--"," -- ");
- $this->addPatternAndReplacement("\""," '' ");
+ $this->addPatternAndReplacement("([^.])([.])([])}>\"\']*)[ ]*$", "\${1} \${2}\${3}");
+ $this->addPatternAndReplacement("[?!]", " $0 ");
+ $this->addPatternAndReplacement("[][(){}<>]", " $0 ");
+ $this->addPatternAndReplacement("--", " -- ");
+ $this->addPatternAndReplacement('"', " '' ");
- $this->addPatternAndReplacement("([^'])' ","\${1} ' ");
- $this->addPatternAndReplacement("'([sSmMdD]) "," '\${1} ");
- $this->addPatternAndReplacement("'ll "," 'll ");
- $this->addPatternAndReplacement("'re "," 're ");
- $this->addPatternAndReplacement("'ve "," 've ");
- $this->addPatternAndReplacement("n't "," n't ");
- $this->addPatternAndReplacement("'LL "," 'LL ");
- $this->addPatternAndReplacement("'RE "," 'RE ");
- $this->addPatternAndReplacement("'VE "," 'VE ");
- $this->addPatternAndReplacement("N'T "," N'T ");
+ $this->addPatternAndReplacement("([^'])' ", "\${1} ' ");
+ $this->addPatternAndReplacement("'([sSmMdD]) ", " '\${1} ");
+ $this->addPatternAndReplacement("'ll ", " 'll ");
+ $this->addPatternAndReplacement("'re ", " 're ");
+ $this->addPatternAndReplacement("'ve ", " 've ");
+ $this->addPatternAndReplacement("n't ", " n't ");
+ $this->addPatternAndReplacement("'LL ", " 'LL ");
+ $this->addPatternAndReplacement("'RE ", " 'RE ");
+ $this->addPatternAndReplacement("'VE ", " 'VE ");
+ $this->addPatternAndReplacement("N'T ", " N'T ");
- $this->addPatternAndReplacement(" ([Cc])annot "," \1an not ");
- $this->addPatternAndReplacement(" ([Dd])'ye "," \${1}' ye ");
- $this->addPatternAndReplacement(" ([Gg])imme "," \${1}im me ");
- $this->addPatternAndReplacement(" ([Gg])onna "," \${1}on na ");
- $this->addPatternAndReplacement(" ([Gg])otta "," \${1}ot ta ");
- $this->addPatternAndReplacement(" ([Ll])emme "," \${1}em me ");
- $this->addPatternAndReplacement(" ([Mm])ore'n "," \${1}ore 'n ");
- $this->addPatternAndReplacement(" '([Tt])is "," '\${1} is ");
- $this->addPatternAndReplacement(" '([Tt])was "," '\${1} was ");
- $this->addPatternAndReplacement(" ([Ww])anna "," \${1}an na ");
-
- $this->addPatternAndReplacement(" *"," ");
- $this->addPatternAndReplacement("^ *","");
+        $this->addPatternAndReplacement(" ([Cc])annot ", " \${1}an not ");
+ $this->addPatternAndReplacement(" ([Dd])'ye ", " \${1}' ye ");
+ $this->addPatternAndReplacement(" ([Gg])imme ", " \${1}im me ");
+ $this->addPatternAndReplacement(" ([Gg])onna ", " \${1}on na ");
+ $this->addPatternAndReplacement(" ([Gg])otta ", " \${1}ot ta ");
+ $this->addPatternAndReplacement(" ([Ll])emme ", " \${1}em me ");
+ $this->addPatternAndReplacement(" ([Mm])ore'n ", " \${1}ore 'n ");
+ $this->addPatternAndReplacement(" '([Tt])is ", " '\${1} is ");
+ $this->addPatternAndReplacement(" '([Tt])was ", " '\${1} was ");
+ $this->addPatternAndReplacement(" ([Ww])anna ", " \${1}an na ");
+ $this->addPatternAndReplacement(" *", " ");
+ $this->addPatternAndReplacement("^ *", "");
}
/**
* Appends \stdClass objects to the internal data structure $patternsAndReplacements
- * @param string $pattern
- * @param string $replacement
*/
- protected function addPatternAndReplacement($pattern, $replacement)
+ protected function addPatternAndReplacement(string $pattern, string $replacement): void
{
$instance = new \stdClass();
$instance->pattern = $pattern;
$instance->replacement = $replacement;
$this->patternsAndReplacements[] = $instance;
}
-
}
diff --git a/src/NlpTools/Tokenizers/RegexTokenizer.php b/src/NlpTools/Tokenizers/RegexTokenizer.php
index 27c1832..84d4896 100644
--- a/src/NlpTools/Tokenizers/RegexTokenizer.php
+++ b/src/NlpTools/Tokenizers/RegexTokenizer.php
@@ -1,5 +1,7 @@
$patterns The regular expressions
*/
- public function __construct(array $patterns)
+ public function __construct(protected array $patterns)
{
- $this->patterns = $patterns;
}
/**
@@ -34,19 +32,22 @@ public function __construct(array $patterns)
* pattern used with preg_replace
*
* @param string $str The string to be tokenized
- * @return array The tokens
+ * @return array The tokens
*/
- public function tokenize($str)
+ public function tokenize(string $str): array
{
- $str = array($str);
- foreach ($this->patterns as $p) {
- if (!is_array($p)) $p = array($p);
- if (count($p)==1) { // split pattern
- $this->split($str, $p[0]);
- } elseif (is_int($p[1])) { // match pattern
- $this->match($str, $p[0], $p[1]);
+ $str = [$str];
+ foreach ($this->patterns as $pattern) {
+ if (!is_array($pattern)) {
+ $pattern = [$pattern];
+ }
+
+ if (count($pattern) === 1) { // split pattern
+ $this->split($str, $pattern[0]);
+ } elseif (is_int($pattern[1])) { // match pattern
+                $this->match($str, $pattern[0], $pattern[1]);
} else { // replace pattern
- $this->replace($str, $p[0], $p[1]);
+ $this->replace($str, $pattern[0], $pattern[1]);
}
}
@@ -56,15 +57,15 @@ public function tokenize($str)
/**
* Execute the SPLIT mode
*
- * @param array &$str The tokens to be further tokenized
+ * @param array &$str The tokens to be further tokenized
*/
- protected function split(array &$str, $pattern)
+ protected function split(array &$str, string $pattern): void
{
- $tokens = array();
+ $tokens = [];
foreach ($str as $s) {
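+            // a limit of -1 means "no limit"; passing null here is deprecated as of PHP 8.1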
$tokens = array_merge(
$tokens,
- preg_split($pattern, $s, null, PREG_SPLIT_NO_EMPTY)
+ preg_split($pattern, (string) $s, -1, PREG_SPLIT_NO_EMPTY)
);
}
@@ -74,13 +75,13 @@ protected function split(array &$str, $pattern)
/**
* Execute the KEEP_MATCHES mode
*
- * @param array &$str The tokens to be further tokenized
+ * @param array &$str The tokens to be further tokenized
*/
- protected function match(array &$str, $pattern, $keep)
+    protected function match(array &$str, string $pattern, int $keep): void
{
- $tokens = array();
+ $tokens = [];
foreach ($str as $s) {
- preg_match_all($pattern, $s, $m);
+ preg_match_all($pattern, (string) $s, $m);
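+            // keep only the requested capture group from every match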
$tokens = array_merge(
$tokens,
$m[$keep]
@@ -93,9 +94,9 @@ protected function match(array &$str, $pattern, $keep)
/**
* Execute the TRANSFORM mode.
*
- * @param string $str The string to be tokenized
+ * @param array &$str The tokens to be further tokenized
*/
- protected function replace(array &$str, $pattern, $replacement)
+ protected function replace(array &$str, string $pattern, string $replacement): void
{
foreach ($str as &$s) {
$s = preg_replace($pattern, $replacement, $s);
diff --git a/src/NlpTools/Tokenizers/TokenizerInterface.php b/src/NlpTools/Tokenizers/TokenizerInterface.php
index 99dbf74..3aae379 100644
--- a/src/NlpTools/Tokenizers/TokenizerInterface.php
+++ b/src/NlpTools/Tokenizers/TokenizerInterface.php
@@ -1,5 +1,7 @@
The list of tokens from the string
*/
- public function tokenize($str);
+ public function tokenize(string $str): array;
}
diff --git a/src/NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizer.php b/src/NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizer.php
index e351418..9a55909 100644
--- a/src/NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizer.php
+++ b/src/NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizer.php
@@ -1,5 +1,7 @@
+ */
+ protected array $transforms;
- protected $transforms;
- protected $classes = array();
+ /**
+ * @var array
+ */
+ protected array $classes = [];
/**
* In order to classify anything with NlpTools we need something
* that implements the ClassifierInterface. We also need the set
* of classes but that will be calculated by the classes for which
* we register a transformation.
- *
- * @param ClassifierInterface $cls
*/
- public function __construct(ClassifierInterface $cls)
+ public function __construct(protected ClassifierInterface $classifier)
{
- $this->cls = $cls;
}
/**
* Classify the passed in variable w and then apply each transformation
* to the output of the previous one.
*/
- public function transform($w)
+ public function transform(string $w): string
{
- $class = $this->cls->classify(
+ $class = $this->classifier->classify(
$this->classes,
new RawDocument($w)
);
@@ -52,14 +56,14 @@ public function transform($w)
/**
* Register a set of transformations for a given class.
*
- * @param string $class
- * @param array|TransformationInterface Either an array of transformations or a single transformation
+ * @param array|TransformationInterface $transforms Either an array of transformations or a single transformation
*/
- public function register($class, $transforms)
+ public function register(string $class, array|TransformationInterface $transforms): void
{
if (!is_array($transforms)) {
- $transforms = array($transforms);
+ $transforms = [$transforms];
}
+
foreach ($transforms as $t) {
if (!($t instanceof TransformationInterface)) {
throw new \InvalidArgumentException("Only instances of TransformationInterface can be registered");
@@ -68,11 +72,11 @@ public function register($class, $transforms)
if (!isset($this->transforms[$class])) {
$this->classes[] = $class;
- $this->transforms[$class] = array();
+ $this->transforms[$class] = [];
}
- foreach ($transforms as $t) {
- $this->transforms[$class][] = $t;
+ foreach ($transforms as $transform) {
+ $this->transforms[$class][] = $transform;
}
}
}
diff --git a/src/NlpTools/Utils/EnglishVowels.php b/src/NlpTools/Utils/EnglishVowels.php
index 1b2779f..e281198 100644
--- a/src/NlpTools/Utils/EnglishVowels.php
+++ b/src/NlpTools/Utils/EnglishVowels.php
@@ -1,4 +1,7 @@
+ */
+ protected static array $dirty = ['ά', 'έ', 'ό', 'ή', 'ί', 'ύ', 'ώ', 'ς'];
+
+ /**
+ * @var array
+ */
+ protected static array $clean = ['α', 'ε', 'ο', 'η', 'ι', 'υ', 'ω', 'σ'];
- public function normalize($w)
+ public function normalize(string $w): string
{
return str_replace(self::$dirty, self::$clean, mb_strtolower($w, "utf-8"));
}
diff --git a/src/NlpTools/Utils/Normalizers/Normalizer.php b/src/NlpTools/Utils/Normalizers/Normalizer.php
index 094a16d..393446d 100644
--- a/src/NlpTools/Utils/Normalizers/Normalizer.php
+++ b/src/NlpTools/Utils/Normalizers/Normalizer.php
@@ -1,5 +1,7 @@
normalize($w);
}
/**
* Apply the normalize function to all the items in the array
- * @param array $items
- * @return array
+ *
+ * @param array $items
+ * @return array
*/
- public function normalizeAll(array $items)
+ public function normalizeAll(array $items): array
{
return array_map(
- array($this, 'normalize'),
+ $this->normalize(...),
$items
);
}
@@ -54,12 +56,10 @@ public function normalizeAll(array $items)
* Just instantiate the normalizer using a factory method.
* Keep in mind that this is NOT required. The constructor IS
* visible.
- *
- * @param string $language
*/
- public static function factory($language = "English")
+ public static function factory(string $language = "English"): self
{
- $classname = __NAMESPACE__."\\$language";
+        $classname = __NAMESPACE__ . '\\' . $language;
return new $classname();
}
diff --git a/src/NlpTools/Utils/StopWords.php b/src/NlpTools/Utils/StopWords.php
index e34f60f..8a606ed 100644
--- a/src/NlpTools/Utils/StopWords.php
+++ b/src/NlpTools/Utils/StopWords.php
@@ -1,5 +1,7 @@
+ */
+ protected array $stopwords;
- public function __construct(array $stopwords, TransformationInterface $transform = null)
+ /**
+ * @param array $stopwords
+ */
+ public function __construct(array $stopwords, protected ?TransformationInterface $transformation = null)
{
$this->stopwords = array_fill_keys(
$stopwords,
true
);
-
- $this->inner_transform = $transform;
}
- public function transform($token)
+ public function transform(string $token): ?string
{
$tocheck = $token;
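+        // the inner transformation is only used for the stop-word lookup; the untouched token is returned when it is kept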
- if ($this->inner_transform) {
- $tocheck = $this->inner_transform->transform($token);
+ if ($this->transformation instanceof TransformationInterface) {
+ $tocheck = $this->transformation->transform($token);
}
return isset($this->stopwords[$tocheck]) ? null : $token;
diff --git a/src/NlpTools/Utils/TransformationInterface.php b/src/NlpTools/Utils/TransformationInterface.php
index ae11d51..3f0964b 100644
--- a/src/NlpTools/Utils/TransformationInterface.php
+++ b/src/NlpTools/Utils/TransformationInterface.php
@@ -1,5 +1,7 @@
assertTrue(count($freqDist->getHapaxes()) === 3);
+class FreqDistTest extends TestCase
+{
+ public function testSimpleFreqDist(): void
+ {
+ $freqDist = new FreqDist(["time", "flies", "like", "an", "arrow", "time", "flies", "like", "what"]);
+ $this->assertTrue(count($freqDist->getHapaxes()) === 3);
$this->assertEquals(9, $freqDist->getTotalTokens());
$this->assertEquals(6, $freqDist->getTotalUniqueTokens());
}
- public function testSimpleFreqWeight()
- {
- $freqDist = new FreqDist(array("time", "flies", "like", "an", "arrow", "time", "flies", "like", "what"));
+ public function testSimpleFreqWeight(): void
+ {
+ $freqDist = new FreqDist(["time", "flies", "like", "an", "arrow", "time", "flies", "like", "what"]);
$this->assertEquals(1, $freqDist->getTotalByToken('an'));
- $this->assertEquals(0.111, $freqDist->getTokenWeight('an'));
+ $this->assertEquals(0.111, round($freqDist->getTokenWeight('an'), 3));
}
-
- public function testEmptyHapaxesFreqDist()
- {
- $freqDist = new FreqDist(array("time", "time", "what", "what"));
- $this->assertTrue(count($freqDist->getHapaxes()) === 0);
+
+ public function testEmptyHapaxesFreqDist(): void
+ {
+ $freqDist = new FreqDist(["time", "time", "what", "what"]);
+ $this->assertTrue($freqDist->getHapaxes() === []);
$this->assertEquals(4, $freqDist->getTotalTokens());
$this->assertEquals(2, $freqDist->getTotalUniqueTokens());
}
-
- public function testSingleHapaxFreqDist()
+
+ public function testSingleHapaxFreqDist(): void
{
- $freqDist = new FreqDist(array("time"));
- $this->assertTrue(count($freqDist->getHapaxes()) === 1);
+ $freqDist = new FreqDist(["time"]);
+ $this->assertTrue(count($freqDist->getHapaxes()) === 1);
$this->assertEquals(1, $freqDist->getTotalTokens());
- $this->assertEquals(1, $freqDist->getTotalUniqueTokens());
+ $this->assertEquals(1, $freqDist->getTotalUniqueTokens());
}
}
-
diff --git a/tests/NlpTools/Analysis/IdfTest.php b/tests/NlpTools/Analysis/IdfTest.php
index 377eeee..9abc55f 100644
--- a/tests/NlpTools/Analysis/IdfTest.php
+++ b/tests/NlpTools/Analysis/IdfTest.php
@@ -1,47 +1,44 @@
addDocument(
+ $trainingSet = new TrainingSet();
+ $trainingSet->addDocument(
"",
- new TokensDocument(array("a","b","c","d"))
+ new TokensDocument(["a", "b", "c", "d"])
);
- $ts->addDocument(
+ $trainingSet->addDocument(
"",
- new TokensDocument(array("a","c","d"))
+ new TokensDocument(["a", "c", "d"])
);
- $ts->addDocument(
+ $trainingSet->addDocument(
"",
- new TokensDocument(array("a"))
+ new TokensDocument(["a"])
);
- $idf = new Idf($ts);
+ $idf = new Idf($trainingSet);
$this->assertEquals(
0.405,
- $idf["c"],
- null,
- 0.001
+ round($idf["c"], 3),
);
$this->assertEquals(
- 1.098,
- $idf["b"],
- null,
- 0.001
+ 1.099,
+ round($idf["b"], 3),
);
$this->assertEquals(
- 1.098,
- $idf["non-existing"],
- null,
- 0.001
+ 1.099,
+ round($idf["non-existing"], 3),
);
$this->assertEquals(
0,
diff --git a/tests/NlpTools/Classifiers/EndOfSentenceRules.php b/tests/NlpTools/Classifiers/EndOfSentenceRules.php
index e8b7f3d..9733d4a 100644
--- a/tests/NlpTools/Classifiers/EndOfSentenceRules.php
+++ b/tests/NlpTools/Classifiers/EndOfSentenceRules.php
@@ -1,23 +1,29 @@
getDocumentData();
+ [$token, $before, $after] = $document->getDocumentData();
- $dotcnt = count(explode('.',$token))-1;
- $lastdot = substr($token,-1)=='.';
+ $dotcnt = count(explode('.', (string) $token)) - 1;
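+        // str_ends_with() is available natively since PHP 8.0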
+ $lastdot = str_ends_with((string) $token, '.');
- if (!$lastdot) // assume that all sentences end in full stops
+ if (!$lastdot) {
+ // assume that all sentences end in full stops
return 'O';
+ }
- if ($dotcnt>1) // to catch some naive abbreviations (e.g.: U.S.A.)
+ if ($dotcnt > 1) {
+ // to catch some naive abbreviations (e.g.: U.S.A.)
return 'O';
+ }
return 'EOW';
}
diff --git a/tests/NlpTools/Clustering/ClusteringTestBase.php b/tests/NlpTools/Clustering/ClusteringTestBase.php
index 5e694d9..d81f880 100644
--- a/tests/NlpTools/Clustering/ClusteringTestBase.php
+++ b/tests/NlpTools/Clustering/ClusteringTestBase.php
@@ -1,62 +1,71 @@
*/
- protected function getColor($t)
+ protected function getColor(float $t): array
{
- $u = function ($x) { return ($x>0) ? 1 : 0; };
- $pulse = function ($x,$a,$b) use ($u) { return $u($x-$a)-$u($x-$b); };
-
- return array(
- (int) ( 255*( $pulse($t,0,1/3) + $pulse($t,1/3,2/3)*(2-3*$t) ) ),
- (int) ( 255*( $pulse($t,0,1/3)*3*$t + $pulse($t,1/3,2/3) + $pulse($t,2/3,1)*(3-3*$t) ) ),
- (int) ( 255*( $pulse($t,1/3,2/3)*(3*$t-1) + $pulse($t,2/3,1) ) )
- );
+ $u = fn($x): int => ($x > 0) ? 1 : 0;
+ $pulse = fn($x, $a, $b): int => $u($x - $a) - $u($x - $b);
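+        // $pulse($t, $a, $b) is 1 when $a < $t <= $b and 0 otherwise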
+
+        return [
+            (int) (255 * ($pulse($t, 0, 1 / 3) + $pulse($t, 1 / 3, 2 / 3) * (2 - 3 * $t))),
+            (int) (255 * ($pulse($t, 0, 1 / 3) * 3 * $t + $pulse($t, 1 / 3, 2 / 3) + $pulse($t, 2 / 3, 1) * (3 - 3 * $t))),
+            (int) (255 * ($pulse($t, 1 / 3, 2 / 3) * (3 * $t - 1) + $pulse($t, 2 / 3, 1))),
+        ];
}
/**
* Return a gd handle with a visualization of the clustering or null in case gd is not present.
+ *
+ * @param array $clusters
+ * @param array|null $centroids
*/
- protected function drawClusters($tset, $clusters, $centroids=null, $lines=False,$emphasize=0,$w=300,$h=200)
+ protected function drawClusters(TrainingSet $trainingSet, array $clusters, ?array $centroids = null, bool $lines = false, int $emphasize = 0, int $w = 300, int $h = 200): mixed
{
- if (!function_exists('imagecreate'))
+ if (!function_exists('imagecreate')) {
return null;
+ }
- $im = imagecreatetruecolor($w,$h);
- $white = imagecolorallocate($im,255,255,255);
- $colors = array();
- $NC = count($clusters);
- for ($i=1;$i<=$NC;$i++) {
- list($r,$g,$b) = $this->getColor($i/$NC);
- $colors[] = imagecolorallocate($im,$r,$g,$b);
+ $im = imagecreatetruecolor($w, $h);
+ $white = imagecolorallocate($im, 255, 255, 255);
+ $colors = [];
+ $numberOfClusters = count($clusters);
+ for ($i = 1; $i <= $numberOfClusters; $i++) {
+ [$r, $g, $b] = $this->getColor($i / $numberOfClusters);
+ $colors[] = imagecolorallocate($im, $r, $g, $b);
}
- imagefill($im,0,0,$white);
- foreach ($clusters as $cid=>$cluster) {
+ imagefill($im, 0, 0, $white);
+ foreach ($clusters as $cid => $cluster) {
foreach ($cluster as $idx) {
- $data = $tset[$idx]->getDocumentData();
- if ($emphasize>0)
- imagefilledarc($im,$data['x'],$data['y'],$emphasize,$emphasize,0,360,$colors[$cid],0);
- else
- imagesetpixel($im,$data['x'],$data['y'],$colors[$cid]);
+ $data = $trainingSet[$idx]->getDocumentData();
+ if ($emphasize > 0) {
+ imagefilledarc($im, $data['x'], $data['y'], $emphasize, $emphasize, 0, 360, $colors[$cid], 0);
+ } else {
+ imagesetpixel($im, $data['x'], $data['y'], $colors[$cid]);
+ }
}
+
if (is_array($centroids)) {
$x = $centroids[$cid]['x'];
$y = $centroids[$cid]['y'];
if ($lines) {
// draw line
// for cosine similarity
- imagesetthickness($im,5);
- imageline($im,0,0,$x*400,$y*400,$colors[$cid]);
+ imagesetthickness($im, 5);
+ imageline($im, 0, 0, $x * 400, $y * 400, $colors[$cid]);
} else {
// draw circle for euclidean
- imagefilledarc($im,$x,$y,10,10,0,360,$colors[$cid],0);
+ imagefilledarc($im, (int) $x, (int) $y, 10, 10, 0, 360, $colors[$cid], 0);
}
}
}
@@ -67,23 +76,26 @@ protected function drawClusters($tset, $clusters, $centroids=null, $lines=False,
/**
* Return a gd handle with a visualization of the given dendrogram or null
* if gd is not present.
+ *
+ * @param array $dendrogram
*/
- protected function drawDendrogram($tset, $dendrogram, $w=300, $h=200)
+ protected function drawDendrogram(TrainingSet $trainingSet, array $dendrogram, int $w = 300, int $h = 200): mixed
{
- if (!function_exists('imagecreate'))
+ if (!function_exists('imagecreate')) {
return null;
+ }
- $im = imagecreatetruecolor($w,$h);
- $white = imagecolorallocate($im, 255,255,255);
- $black = imagecolorallocate($im, 0,0,0);
- $blue = imagecolorallocate($im, 0,0,255);
- imagefill($im, 0,0, $white);
+ $im = imagecreatetruecolor($w, $h);
+ $white = imagecolorallocate($im, 255, 255, 255);
+ $black = imagecolorallocate($im, 0, 0, 0);
+ $blue = imagecolorallocate($im, 0, 0, 255);
+ imagefill($im, 0, 0, $white);
// padding 5%
- $padding = round(0.05*$w);
+ $padding = round(0.05 * $w);
// equally distribute
- $d = ($w-2*$padding)/count($tset);
- $count_depth = function ($a) use (&$depth, &$count_depth) {
+ $d = ($w - 2 * $padding) / count($trainingSet);
+ $count_depth = function ($a) use (&$count_depth): int|float {
if (is_array($a)) {
return max(
array_map(
@@ -91,38 +103,40 @@ protected function drawDendrogram($tset, $dendrogram, $w=300, $h=200)
$a
)
) + 1;
- } else {
- return 1;
}
+
+ return 1;
};
- $depth = $count_depth($dendrogram)-1;
- $d_v = ($h-2*$padding)/$depth;
+ $depth = $count_depth($dendrogram) - 1;
+ $d_v = ($h - 2 * $padding) / $depth;
// offset from bottom
- $y = $h-$padding;
+ $y = $h - $padding;
$left = $padding;
- $draw_subcluster = function ($dendrogram, &$left) use (&$im, $d, $y, $d_v, $black, &$draw_subcluster,$blue) {
+ $draw_subcluster = function ($dendrogram, &$left) use (&$im, $d, $y, $d_v, $black, &$draw_subcluster, $blue): array {
if (!is_array($dendrogram)) {
- imagestring($im, 1, $left-(2 * strlen($dendrogram)), $y, $dendrogram, $black);
+ imagestring($im, 1, (int) ($left - (2 * strlen((string) $dendrogram))), (int) $y, (string) $dendrogram, $black);
$left += $d;
- return array($left - $d,$y-5);
+ return [$left - $d, $y - 5];
}
- list($l,$yl) = $draw_subcluster($dendrogram[0],$left);
- list($r,$yr) = $draw_subcluster($dendrogram[1],$left);
- $ym = min($yl,$yr)-$d_v;
- imageline($im, $l, $yl, $l, $ym, $blue);
- imageline($im, $r, $yr, $r, $ym, $blue);
- imageline($im, $l, $ym, $r, $ym, $blue);
-
- return array($l+($r-$l)/2,$ym);
+
+ [$l, $yl] = $draw_subcluster($dendrogram[0], $left);
+ [$r, $yr] = $draw_subcluster($dendrogram[1], $left);
+ $ym = min($yl, $yr) - $d_v;
+ imageline($im, (int) $l, (int) $yl, (int) $l, (int) $ym, $blue);
+ imageline($im, (int) $r, (int) $yr, (int) $r, (int) $ym, $blue);
+ imageline($im, (int) $l, (int) $ym, (int) $r, (int) $ym, $blue);
+
+ return [$l + ($r - $l) / 2, $ym];
};
- if (count($dendrogram)==1)
- $draw_subcluster($dendrogram[0],$left);
- else
- $draw_subcluster($dendrogram,$left);
+ if (count($dendrogram) === 1) {
+ $draw_subcluster($dendrogram[0], $left);
+ } else {
+ $draw_subcluster($dendrogram, $left);
+ }
return $im;
}
diff --git a/tests/NlpTools/Clustering/HierarchicalTest.php b/tests/NlpTools/Clustering/HierarchicalTest.php
index 467b43d..430cf89 100644
--- a/tests/NlpTools/Clustering/HierarchicalTest.php
+++ b/tests/NlpTools/Clustering/HierarchicalTest.php
@@ -1,5 +1,7 @@
0,'y'=>0),
- array('x'=>0,'y'=>1),
- array('x'=>1,'y'=>3),
- array('x'=>4,'y'=>6),
- array('x'=>6,'y'=>6)
- );
+ $docs = [['x' => 0, 'y' => 0], ['x' => 0, 'y' => 1], ['x' => 1, 'y' => 3], ['x' => 4, 'y' => 6], ['x' => 6, 'y' => 6]];
- $sl = new SingleLink();
- $sl->initializeStrategy(new Euclidean(), $docs);
+ $singleLink = new SingleLink();
+ $singleLink->initializeStrategy(new Euclidean(), $docs);
- $pair = $sl->getNextMerge();
+ $pair = $singleLink->getNextMerge();
$this->assertEquals(
- array(0,1),
+ [0, 1],
$pair
);
- $pair = $sl->getNextMerge();
+ $pair = $singleLink->getNextMerge();
$this->assertEquals(
- array(3,4),
+ [3, 4],
$pair
);
- $pair = $sl->getNextMerge();
+ $pair = $singleLink->getNextMerge();
$this->assertEquals(
- array(0,2),
+ [0, 2],
$pair
);
- $pair = $sl->getNextMerge();
+ $pair = $singleLink->getNextMerge();
$this->assertEquals(
- array(0,3),
+ [0, 3],
$pair
);
- $this->setExpectedException(
- "RuntimeException",
- "Can't extract from an empty heap"
- );
- $sl->getNextMerge();
+ $this->expectException(\RuntimeException::class);
+ $singleLink->getNextMerge();
}
/**
@@ -88,55 +83,45 @@ public function testSingleLink()
* 0 1 2 3 4 7
*
*/
- public function testCompleteLink()
+ public function testCompleteLink(): void
{
- $docs = array(
- array('x'=>0,'y'=>1),
- array('x'=>1,'y'=>1),
- array('x'=>2,'y'=>1),
- array('x'=>3,'y'=>1),
- array('x'=>4,'y'=>1),
- array('x'=>7,'y'=>1)
- );
+ $docs = [['x' => 0, 'y' => 1], ['x' => 1, 'y' => 1], ['x' => 2, 'y' => 1], ['x' => 3, 'y' => 1], ['x' => 4, 'y' => 1], ['x' => 7, 'y' => 1]];
- $cl = new CompleteLink();
- $cl->initializeStrategy(new Euclidean(), $docs);
+ $completeLink = new CompleteLink();
+ $completeLink->initializeStrategy(new Euclidean(), $docs);
- $pair = $cl->getNextMerge();
+ $pair = $completeLink->getNextMerge();
$this->assertEquals(
- array(0,1),
+ [0, 1],
$pair
);
- $pair = $cl->getNextMerge();
+ $pair = $completeLink->getNextMerge();
$this->assertEquals(
- array(2,3),
+ [2, 3],
$pair
);
- $pair = $cl->getNextMerge();
+ $pair = $completeLink->getNextMerge();
$this->assertEquals(
- array(2,4),
+ [2, 4],
$pair
);
- $pair = $cl->getNextMerge();
+ $pair = $completeLink->getNextMerge();
$this->assertEquals(
- array(0,2),
+ [0, 2],
$pair
);
- $pair = $cl->getNextMerge();
+ $pair = $completeLink->getNextMerge();
$this->assertEquals(
- array(0,5),
+ [0, 5],
$pair
);
- $this->setExpectedException(
- "RuntimeException",
- "Can't extract from an empty heap"
- );
- $cl->getNextMerge();
+ $this->expectException(\RuntimeException::class);
+ $completeLink->getNextMerge();
}
/**
@@ -176,177 +161,151 @@ public function testCompleteLink()
* because the distance between the groups {0,1}-{2,3} is 2 and {2,3},{4.5} is also 2.
*
*/
- public function testGroupAverage()
+ public function testGroupAverage(): void
{
- $docs = array(
- array('x'=>0,'y'=>1),
- array('x'=>1,'y'=>1),
- array('x'=>2,'y'=>1),
- array('x'=>3,'y'=>1),
- array('x'=>4.51,'y'=>1),
- );
+ $docs = [['x' => 0, 'y' => 1], ['x' => 1, 'y' => 1], ['x' => 2, 'y' => 1], ['x' => 3, 'y' => 1], ['x' => 4.51, 'y' => 1]];
- $ga = new GroupAverage();
- $ga->initializeStrategy(new Euclidean(), $docs);
+ $groupAverage = new GroupAverage();
+ $groupAverage->initializeStrategy(new Euclidean(), $docs);
- $pair = $ga->getNextMerge();
+ $pair = $groupAverage->getNextMerge();
$this->assertEquals(
- array(0,1),
+ [0, 1],
$pair
);
- $pair = $ga->getNextMerge();
+ $pair = $groupAverage->getNextMerge();
$this->assertEquals(
- array(2,3),
+ [2, 3],
$pair
);
- $pair = $ga->getNextMerge();
+ $pair = $groupAverage->getNextMerge();
$this->assertEquals(
- array(0,2),
+ [0, 2],
$pair
);
- $pair = $ga->getNextMerge();
+ $pair = $groupAverage->getNextMerge();
$this->assertEquals(
- array(0,4),
+ [0, 4],
$pair
);
- $docs[4] = array('x'=>4.49,'y'=>1);
- $ga->initializeStrategy(new Euclidean(), $docs);
+ $docs[4] = ['x' => 4.49, 'y' => 1];
+ $groupAverage->initializeStrategy(new Euclidean(), $docs);
- $pair = $ga->getNextMerge();
+ $pair = $groupAverage->getNextMerge();
$this->assertEquals(
- array(0,1),
+ [0, 1],
$pair
);
- $pair = $ga->getNextMerge();
+ $pair = $groupAverage->getNextMerge();
$this->assertEquals(
- array(2,3),
+ [2, 3],
$pair
);
- $pair = $ga->getNextMerge();
+ $pair = $groupAverage->getNextMerge();
$this->assertEquals(
- array(2,4),
+ [2, 4],
$pair
);
- $pair = $ga->getNextMerge();
+ $pair = $groupAverage->getNextMerge();
$this->assertEquals(
- array(0,2),
+ [0, 2],
$pair
);
}
- public function testDendrogramToClusters()
+ public function testDendrogramToClusters(): void
{
- $dendrograms = array(
- array(
- array(array(0,1),array(array(2,3),4)),
- array(array(0,1),array(2,3,4))
- ),
- array(
- array(array(0,array(1,array(2,array(3,array(4,array(5,array(6,7)))))))),
- array(array(0),array(1),array(2),array(3,4,5,6,7))
- )
- );
+        $dendrograms = [
+            [
+                [[0, 1], [[2, 3], 4]],
+                [[0, 1], [2, 3, 4]],
+            ],
+            [
+                [[0, [1, [2, [3, [4, [5, [6, 7]]]]]]]],
+                [[0], [1], [2], [3, 4, 5, 6, 7]],
+            ],
+        ];
- foreach ($dendrograms as $i=>$d) {
+ foreach ($dendrograms as $i => $d) {
$this->assertEquals(
$d[1],
Hierarchical::dendrogramToClusters(
$d[0],
count($d[1])
),
- "Error transforming dendrogram $i"
+ 'Error transforming dendrogram ' . $i
);
}
}
- public function testClustering1()
+ public function testClustering1(): void
{
- $points = array(
- array('x'=>1, 'y'=>1),
- array('x'=>1, 'y'=>2),
- array('x'=>2, 'y'=>2),
- array('x'=>3, 'y'=>3),
- array('x'=>3, 'y'=>4),
- );
+ $points = [['x' => 1, 'y' => 1], ['x' => 1, 'y' => 2], ['x' => 2, 'y' => 2], ['x' => 3, 'y' => 3], ['x' => 3, 'y' => 4]];
+
+ $trainingSet = new TrainingSet();
- $tset = new TrainingSet();
- foreach ($points as $p)
- $tset->addDocument('',new TokensDocument($p));
+ foreach ($points as $point) {
+ $trainingSet->addDocument('', new TokensDocument($point));
+ }
- $hc = new Hierarchical(
+ $hierarchical = new Hierarchical(
new SingleLink(), // use the single link strategy
new Euclidean() // with euclidean distance
);
- list($dendrogram) = $hc->cluster($tset,new DataAsFeatures());
+ [$dendrogram] = $hierarchical->cluster($trainingSet, new DataAsFeatures());
$this->assertEquals(
- array(
- array(
- array(
- array(
- 0,
- 1
- ),
- 2
- ),
- array(
- 3,
- 4
- )
- )
- ),
+ [[[[0, 1], 2], [3, 4]]],
$dendrogram
);
}
- public function testClustering2()
+ public function testClustering2(): void
{
$N = 50;
- $tset = new TrainingSet();
- for ($i=0;$i<$N;$i++) {
- $tset->addDocument(
+ $trainingSet = new TrainingSet();
+ for ($i = 0; $i < $N; $i++) {
+ $trainingSet->addDocument(
'',
- EuclideanPoint::getRandomPointAround(100,100,45)
+ EuclideanPoint::getRandomPointAround(100, 100, 45)
);
}
- for ($i=0;$i<$N;$i++) {
- $tset->addDocument(
+
+ for ($i = 0; $i < $N; $i++) {
+ $trainingSet->addDocument(
'',
- EuclideanPoint::getRandomPointAround(200,100,45)
+ EuclideanPoint::getRandomPointAround(200, 100, 45)
);
}
- $hc = new Hierarchical(
+ $hierarchical = new Hierarchical(
new SingleLink(), // use the single link strategy
new Euclidean() // with euclidean distance
);
- list($dendrogram) = $hc->cluster($tset,new DataAsFeatures());
+ [$dendrogram] = $hierarchical->cluster($trainingSet, new DataAsFeatures());
$dg = $this->drawDendrogram(
- $tset,
+ $trainingSet,
$dendrogram,
600 // width
);
- $clusters = Hierarchical::dendrogramToClusters($dendrogram,2);
+ $clusters = Hierarchical::dendrogramToClusters($dendrogram, 2);
$im = $this->drawClusters(
- $tset,
+ $trainingSet,
$clusters,
null, // no centroids
false, // no lines
10 // emphasize points (for little points)
);
- if ($dg)
- imagepng($dg, TEST_DATA_DIR."/Clustering/HierarchicalTest/dendrogram.png");
- if ($im)
- imagepng($im, TEST_DATA_DIR."/Clustering/HierarchicalTest/clusters.png");
+ if ($dg !== null) {
+ imagepng($dg, TEST_DATA_DIR . "/Clustering/HierarchicalTest/dendrogram.png");
+ }
+
+ if ($im !== null) {
+ imagepng($im, TEST_DATA_DIR . "/Clustering/HierarchicalTest/clusters.png");
+ }
+
+        // at minimum, cutting the dendrogram must yield the two requested clusters
+        $this->assertCount(2, $clusters);
}
}
diff --git a/tests/NlpTools/Clustering/KmeansTest.php b/tests/NlpTools/Clustering/KmeansTest.php
index 78e94b3..e5efb23 100644
--- a/tests/NlpTools/Clustering/KmeansTest.php
+++ b/tests/NlpTools/Clustering/KmeansTest.php
@@ -1,5 +1,7 @@
addDocument(
+ $trainingSet = new TrainingSet();
+ for ($i = 0; $i < 500; $i++) {
+ $trainingSet->addDocument(
'A',
- EuclideanPoint::getRandomPointAround(100,100,45)
+ EuclideanPoint::getRandomPointAround(100, 100, 45)
);
}
- for ($i=0;$i<500;$i++) {
- $tset->addDocument(
+
+ for ($i = 0; $i < 500; $i++) {
+ $trainingSet->addDocument(
'B',
- EuclideanPoint::getRandomPointAround(200,100,45)
+ EuclideanPoint::getRandomPointAround(200, 100, 45)
);
}
- list($clusters,$centroids,$distances) = $clust->cluster($tset,new DataAsFeatures());
+ [$clusters, $centroids, $distances] = $kMeans->cluster($trainingSet, new DataAsFeatures());
$im = $this->drawClusters(
- $tset,
+ $trainingSet,
$clusters,
$centroids,
false // lines or not
);
- if ($im)
- imagepng($im,TEST_DATA_DIR."/Clustering/KmeansTest/clusters.png");
+ if ($im !== null && $im !== false) {
+ imagepng($im, TEST_DATA_DIR . "/Clustering/KmeansTest/clusters.png");
+ }
// since the dataset is artificial and clearly separated, the kmeans
// algorithm should always cluster it correctly
- foreach ($clusters as $clust) {
- $classes = array();
- foreach ($clust as $point_idx) {
- $class = $tset[$point_idx]->getClass();
- if (!isset($classes[$class]))
+ foreach ($clusters as $cluster) {
+ $classes = [];
+ foreach ($cluster as $point_idx) {
+ $class = $trainingSet[$point_idx]->getClass();
+ if (!isset($classes[$class])) {
$classes[$class] = true;
+ }
}
+
// assert that all the documents (points) in this cluster belong
// in the same class
$this->assertCount(
diff --git a/tests/NlpTools/Documents/EuclideanPoint.php b/tests/NlpTools/Documents/EuclideanPoint.php
index 1a12d82..460109d 100644
--- a/tests/NlpTools/Documents/EuclideanPoint.php
+++ b/tests/NlpTools/Documents/EuclideanPoint.php
@@ -1,38 +1,41 @@
x = $x;
- $this->y = $y;
}
- public function getDocumentData()
+
+ /**
+ * @return array
+ */
+ public function getDocumentData(): array
{
- return array(
- 'x'=>$this->x,
- 'y'=>$this->y
- );
+ return ['x' => $this->x, 'y' => $this->y];
}
- public static function getRandomPointAround($x,$y,$R)
+ public static function getRandomPointAround(int $x, int $y, int $R): EuclideanPoint
{
return new EuclideanPoint(
- $x+mt_rand(-$R,$R),
- $y+mt_rand(-$R,$R)
+ $x + mt_rand(-$R, $R),
+ $y + mt_rand(-$R, $R)
);
}
- public function applyTransformation(TransformationInterface $transform)
+ public function applyTransformation(TransformationInterface $transformation): void
+ {
+ $this->x = (int) $transformation->transform((string) $this->x);
+ $this->y = (int) $transformation->transform((string) $this->y);
+ }
+
+ public function getClass(): string
{
- $this->x = $transform->transform($this->x);
- $this->y = $transform->transform($this->y);
+ return self::class;
}
}
diff --git a/tests/NlpTools/Documents/TransformationsTest.php b/tests/NlpTools/Documents/TransformationsTest.php
index 2822870..b298d18 100644
--- a/tests/NlpTools/Documents/TransformationsTest.php
+++ b/tests/NlpTools/Documents/TransformationsTest.php
@@ -1,62 +1,72 @@
+ */
+ public static function provideTokens(): array
{
- return array(
- array(array("1","2","3","4","5","6","7"))
- );
+ return [[["1", "2", "3", "4", "5", "6", "7"]]];
}
/**
- * @dataProvider provideTokens
+ * @param array $tokens
*/
- public function testTokensDocument($tokens)
+ #[DataProvider('provideTokens')]
+ public function testTokensDocument(array $tokens): void
{
- $doc = new TokensDocument($tokens);
- $transformer = new IdentityTransformer();
+ $tokensDocument = new TokensDocument($tokens);
+ $identityTransformer = new IdentityTransformer();
$this->assertEquals(
$tokens,
- $doc->getDocumentData()
+ $tokensDocument->getDocumentData()
);
- $doc->applyTransformation($transformer);
+ $tokensDocument->applyTransformation($identityTransformer);
$this->assertEquals(
$tokens,
- $doc->getDocumentData()
+ $tokensDocument->getDocumentData()
);
- $tdoc = new TrainingDocument("", new TokensDocument($tokens));
- $tdoc->applyTransformation($transformer);
+ $trainingDocument = new TrainingDocument("", new TokensDocument($tokens));
+ $trainingDocument->applyTransformation($identityTransformer);
$this->assertEquals(
$tokens,
- $tdoc->getDocumentData()
+ $trainingDocument->getDocumentData()
);
}
/**
- * @dataProvider provideTokens
+ * @param array $tokens
*/
- public function testWordDocument($tokens)
+ #[DataProvider('provideTokens')]
+ public function testWordDocument(array $tokens): void
{
- $transformer = new IdentityTransformer();
- $doc = new WordDocument($tokens,count($tokens)/2, 2);
- $correct = $doc->getDocumentData();
- $doc->applyTransformation($transformer);
+ $identityTransformer = new IdentityTransformer();
+ $wordDocument = new WordDocument($tokens, (int) (count($tokens) / 2), 2);
+ $correct = $wordDocument->getDocumentData();
+ $wordDocument->applyTransformation($identityTransformer);
$this->assertEquals(
$correct,
- $doc->getDocumentData()
+ $wordDocument->getDocumentData()
);
- $tdoc = new TrainingDocument("", new WordDocument($tokens,count($tokens)/2, 2));
- $tdoc->applyTransformation($transformer);
+ $trainingDocument = new TrainingDocument("", new WordDocument($tokens, (int) (count($tokens) / 2), 2));
+ $trainingDocument->applyTransformation($identityTransformer);
$this->assertEquals(
$correct,
- $tdoc->getDocumentData()
+ $trainingDocument->getDocumentData()
);
}
}
diff --git a/tests/NlpTools/Documents/WordDocumentTest.php b/tests/NlpTools/Documents/WordDocumentTest.php
index 87066a0..9927abf 100644
--- a/tests/NlpTools/Documents/WordDocumentTest.php
+++ b/tests/NlpTools/Documents/WordDocumentTest.php
@@ -1,33 +1,40 @@
+ */
+ protected array $tokens;
- public function __construct()
+ protected function setUp(): void
{
- $this->tokens = array("The","quick","brown","fox","jumped","over","the","lazy","dog");
+ $this->tokens = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog"];
}
/**
* Test that the WordDocument correctly represents the ith token
*/
- public function testTokenSelection()
+ public function testTokenSelection(): void
{
- foreach ($this->tokens as $i=>$t) {
+ foreach ($this->tokens as $i => $t) {
// no context
$doc = new WordDocument($this->tokens, $i, 0);
- list($w,$prev,$next) = $doc->getDocumentData();
+ [$w, $prev, $next] = $doc->getDocumentData();
$this->assertEquals(
$t,
$w,
- "The {$i}th token should be $t not $w"
+ sprintf('The %sth token should be %s not %s', $i, $t, $w)
);
// no context means prev,next are empty
@@ -47,21 +54,22 @@ public function testTokenSelection()
* until it reaches the edges of the token list. Check the
* previous tokens.
*/
- public function testPrevContext()
+ public function testPrevContext(): void
{
- for ($i=0;$i<5;$i++) {
+ for ($i = 0; $i < 5; $i++) {
$doc = new WordDocument($this->tokens, 4, $i);
- list($_,$prev,$_) = $doc->getDocumentData();
+ [$_, $prev, $_] = $doc->getDocumentData();
$this->assertCount(
$i,
$prev,
- "With $i words context prev should be $i words long"
+ sprintf('With %d words context prev should be %d words long', $i, $i)
);
for (
- $j=3,$y=$i-1;
- $j>=4-$i;
- $y--,$j--) {
+                $j = 3, $y = $i - 1;
+                $j >= 4 - $i;
+                $y--, $j--
+ ) {
$this->assertEquals(
$this->tokens[$j],
$prev[$y]
@@ -75,21 +83,21 @@ public function testPrevContext()
* until it reaches the edges of the token list. Check the
* next tokens.
*/
- public function testNextContext()
+ public function testNextContext(): void
{
- for ($i=0;$i<5;$i++) {
+ for ($i = 0; $i < 5; $i++) {
$doc = new WordDocument($this->tokens, 4, $i);
- list($_,$_,$next) = $doc->getDocumentData();
+ [$_, $_, $next] = $doc->getDocumentData();
$this->assertCount(
$i,
$next,
- "With $i words context next should be $i words long"
+ sprintf('With %d words context next should be %d words long', $i, $i)
);
- for ($j=5; $j<5+$i; $j++) {
+ for ($j = 5; $j < 5 + $i; $j++) {
$this->assertEquals(
$this->tokens[$j],
- $next[$j-5]
+ $next[$j - 5]
);
}
}
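For reference, the behaviour these loops assert can be seen with a single concrete call: picking the middle token of the fixture with a context of two words yields the word itself plus its two neighbours on each side. A small usage sketch, with the expected values derived from the assertions in this test rather than from running the code here:

```php
use NlpTools\Documents\WordDocument;

$tokens = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog"];

// The word at index 4 ("jumped") with a symmetric context of 2 tokens.
$doc = new WordDocument($tokens, 4, 2);
[$word, $prev, $next] = $doc->getDocumentData();

// $word === "jumped"
// $prev === ["brown", "fox"]  (the two tokens before index 4)
// $next === ["over", "the"]   (the two tokens after index 4)
```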
diff --git a/tests/NlpTools/Models/LdaTest.php b/tests/NlpTools/Models/LdaTest.php
index 6ce6a50..7a46039 100644
--- a/tests/NlpTools/Models/LdaTest.php
+++ b/tests/NlpTools/Models/LdaTest.php
@@ -1,5 +1,7 @@
+ */
+ protected array $topics;
- protected function setUp()
+ protected function setUp(): void
{
if (!extension_loaded("gd")) {
$this->markTestSkipped("The gd library is not available");
}
- $this->path = TEST_DATA_DIR."/Models/LdaTest";
+ $this->path = TEST_DATA_DIR . "/Models/LdaTest";
if (!file_exists($this->path)) {
- if (!file_exists(TEST_DATA_DIR."/Models"))
- mkdir(TEST_DATA_DIR."/Models");
+ if (!file_exists(TEST_DATA_DIR . "/Models")) {
+ mkdir(TEST_DATA_DIR . "/Models");
+ }
+
mkdir($this->path);
}
- if (!file_exists("{$this->path}/topics")) {
- mkdir("{$this->path}/topics");
+ if (!file_exists($this->path . '/topics')) {
+ mkdir($this->path . '/topics');
}
+
$this->createTopics();
- if (!file_exists("{$this->path}/data")) {
- mkdir("{$this->path}/data");
+ if (!file_exists($this->path . '/data')) {
+ mkdir($this->path . '/data');
}
- if (count(new \DirectoryIterator("{$this->path}/data"))<502) {
+
+ $fileCount = count(glob($this->path . '/data/*'));
+        if ($fileCount < 500) { // createData() writes exactly 500 files; glob() does not count "." and ".."
$this->createData();
}
- if (!file_exists("{$this->path}/results")) {
- mkdir("{$this->path}/results");
+ if (!file_exists($this->path . '/results')) {
+ mkdir($this->path . '/results');
}
$this->loadData();
}
- /**
- * @group Slow
- * @group VerySlow
- */
- public function testLda()
+ #[Group('Slow')]
+ #[Group('VerySlow')]
+ public function testLda(): void
{
$lda = new Lda(
new DataAsFeatures(), // feature factory
@@ -67,7 +79,7 @@ public function testLda()
);
$this->assertInstanceOf(
- "NlpTools\Models\Lda",
+ \NlpTools\Models\Lda::class,
$lda
);
@@ -79,24 +91,20 @@ public function testLda()
$lda->initialize($docs);
- for ($i=0;$i<100;$i++) {
+ for ($i = 0; $i < 100; $i++) {
$lda->gibbsSample($docs);
$topics = $lda->getPhi();
- echo $lda->getLogLikelihood(),PHP_EOL;
- foreach ($topics as $t=>$topic) {
- $name = sprintf("{$this->path}/results/topic-%04d-%04d",$i,$t);
+
+ foreach ($topics as $t => $topic) {
+ $name = sprintf($this->path . '/results/topic-%04d-%04d', $i, $t);
$max = max($topic);
$this->createImage(
array_map(
- function ($x) use ($topic,$max) {
- return array_map(
- function ($y) use ($x,$topic,$max) {
- return (int) (($topic[$y*5+$x]/$max)*255);
- },
- range(0,4)
- );
- },
- range(0,4)
+ fn($x): array => array_map(
+ fn($y): int => (int) (($topic[$y * 5 + $x] / $max) * 255),
+ range(0, 4)
+ ),
+ range(0, 4)
),
$name
);
@@ -114,94 +122,18 @@ function ($y) use ($x,$topic,$max) {
//
// TODO: Unit testing for lda is needed
- protected function createTopics()
+ protected function createTopics(): void
{
- $topics = array(
- array(
- array(1,1,1,1,1),
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(0,0,0,0,0)
- ),
- array(
- array(0,0,0,0,0),
- array(1,1,1,1,1),
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(0,0,0,0,0)
- ),
- array(
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(1,1,1,1,1),
- array(0,0,0,0,0),
- array(0,0,0,0,0)
- ),
- array(
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(1,1,1,1,1),
- array(0,0,0,0,0)
- ),
- array(
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(0,0,0,0,0),
- array(1,1,1,1,1)
- ),
- array(
- array(0,0,0,0,1),
- array(0,0,0,0,1),
- array(0,0,0,0,1),
- array(0,0,0,0,1),
- array(0,0,0,0,1)
- ),
- array(
- array(0,0,0,1,0),
- array(0,0,0,1,0),
- array(0,0,0,1,0),
- array(0,0,0,1,0),
- array(0,0,0,1,0)
- ),
- array(
- array(0,0,1,0,0),
- array(0,0,1,0,0),
- array(0,0,1,0,0),
- array(0,0,1,0,0),
- array(0,0,1,0,0)
- ),
- array(
- array(0,1,0,0,0),
- array(0,1,0,0,0),
- array(0,1,0,0,0),
- array(0,1,0,0,0),
- array(0,1,0,0,0)
- ),
- array(
- array(1,0,0,0,0),
- array(1,0,0,0,0),
- array(1,0,0,0,0),
- array(1,0,0,0,0),
- array(1,0,0,0,0)
- )
- );
+        $topics = [
+            [[1, 1, 1, 1, 1], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
+            [[0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
+            [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
+            [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]],
+            [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [1, 1, 1, 1, 1]],
+            [[0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]],
+            [[0, 0, 0, 1, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 0]],
+            [[0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [0, 0, 1, 0, 0]],
+            [[0, 1, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 0]],
+            [[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0, 0]],
+        ];
$this->topics = array_map(
- function ($topic) {
- $t = call_user_func_array(
- "array_merge",
- $topic
- );
+ function ($topic): array {
+ $t = array_merge(...$topic);
$s = array_sum($t);
return array_map(
- function ($ti) use ($s) {
- return $ti/$s;
- },
+ fn($ti): int|float => $ti / $s,
$t
);
},
@@ -211,44 +143,39 @@ function ($ti) use ($s) {
// multiply by 255 to make gray-scale images of
// the above arrays
$topics = array_map(
- function ($topic) {
- return array_map(
- function ($row) {
- return array_map(
- function ($pixel) {
- return (int) (255*$pixel);
- },
- $row
- );
- },
- $topic
- );
- },
+ fn($topic): array => array_map(
+ fn($row): array => array_map(
+ fn($pixel): int => (int) (255 * $pixel),
+ $row
+ ),
+ $topic
+ ),
$topics
);
// save them to disk
- foreach ($topics as $key=>$topic) {
- $this->createImage($topic, "{$this->path}/topics/topic-$key");
+ foreach ($topics as $key => $topic) {
+ $this->createImage($topic, sprintf('%s/topics/topic-%s', $this->path, $key));
}
}
- protected function createData()
+ protected function createData(): void
{
- $dir = new Dirichlet(1, count($this->topics));
+ $dirichlet = new Dirichlet(1, count($this->topics));
- for ($i=0;$i<500;$i++) {
- $d = $this->createDocument($this->topics, $dir->sample(), 100);
- $this->createImage($d, "{$this->path}/data/$i");
+ for ($i = 0; $i < 500; $i++) {
+ $d = $this->createDocument($this->topics, $dirichlet->sample(), 100);
+ $this->createImage($d, sprintf('%s/data/%d', $this->path, $i));
}
}
- protected function loadData()
+ protected function loadData(): void
{
$this->tset = new TrainingSet();
- foreach (new \DirectoryIterator("{$this->path}/data") as $f) {
- if ($f->isDir())
+ foreach (new \DirectoryIterator($this->path . '/data') as $f) {
+ if ($f->isDir()) {
continue;
+ }
$this->tset->addDocument(
"",
@@ -261,75 +188,87 @@ protected function loadData()
/**
* Save a two dimensional array as a grey-scale image
+ *
+ * @param array $img
*/
- protected function createImage(array $img,$filename)
+ protected function createImage(array $img, string $filename): void
{
- $im = imagecreate(count($img),count(current($img)));
- imagecolorallocate($im,0,0,0);
- foreach ($img as $y=>$row) {
- foreach ($row as $x=>$color) {
- $color = min(255,max(0,$color));
- $c = imagecolorallocate($im,$color,$color,$color);
- imagesetpixel($im,$x,$y,$c);
+ $im = imagecreate(count($img), count(current($img)));
+ imagecolorallocate($im, 0, 0, 0);
+ foreach ($img as $y => $row) {
+ foreach ($row as $x => $color) {
+ $color = min(255, max(0, $color));
+ $c = imagecolorallocate($im, $color, $color, $color);
+ imagesetpixel($im, $x, $y, $c);
}
}
- imagepng($im,$filename);
+
+ imagepng($im, $filename);
}
/**
* Draw once from a multinomial distribution
+ *
+ * @param array $d
*/
- protected function draw($d)
+ protected function draw(array $d): ?int
{
- $mt = MersenneTwister::get(); // simply mt_rand but in the interval [0,1)
- $x = $mt->generate();
+ $mersenneTwister = MersenneTwister::get(); // simply mt_rand but in the interval [0,1)
+ $x = $mersenneTwister->generate();
$p = 0.0;
- foreach ($d as $i=>$v) {
- $p+=$v;
- if ($p > $x)
+ foreach ($d as $i => $v) {
+ $p += $v;
+ if ($p > $x) {
return $i;
+ }
}
+
+ return null;
}
/**
* Create a document sticking to the model's assumptions
* and hypotheses
+ *
+ * @param array $topicDists
+ * @param array $theta
+ * @return array
*/
- public function createDocument($topic_dists,$theta,$length)
+ public function createDocument(array $topicDists, array $theta, int $length): array
{
- $doc = array_fill_keys(range(0,24),0);
+ $doc = array_fill_keys(range(0, 24), 0);
while ($length-- > 0) {
$topic = $this->draw($theta);
- $word = $this->draw($topic_dists[$topic]);
+ $word = $this->draw($topicDists[$topic]);
$doc[$word] += 1;
}
return array_map(
- function ($start) use ($doc) {
- return array_slice($doc,$start,5);
- },
- range(0,24,5)
+ fn($start): array => array_slice($doc, $start, 5),
+ range(0, 24, 5)
);
}
/**
* Load a document from an image saved to disk
+ *
+ * @return array
*/
- public function fromImg($file)
+ public function fromImg(string $file): array
{
$im = imagecreatefrompng($file);
- $d = array();
- for ($w=0;$w<25;$w++) {
- $x = (int) ($w%5);
- $y = (int) ($w/5);
+ $d = [];
+ for ($w = 0; $w < 25; $w++) {
+ $x = $w % 5;
+ $y = (int) ($w / 5);
- $c = imagecolorsforindex($im,imagecolorat($im,$x,$y));
+ $c = imagecolorsforindex($im, imagecolorat($im, $x, $y));
$c = $c['red'];
- if ($c>0) {
+ if ($c > 0) {
$d = array_merge(
$d,
array_fill_keys(
- range(0,$c-1),
+ range(0, $c - 1),
$w
)
);
@@ -338,5 +277,4 @@ public function fromImg($file)
return $d;
}
-
}
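The `draw()` helper above samples a single index from a discrete distribution by accumulating probabilities until they pass a uniform random number; the added `return null` covers the corner case where floating-point rounding lets the loop fall through. A standalone restatement of that mechanic using plain `mt_rand()` instead of the library's `MersenneTwister` wrapper (a sketch for illustration, not the test helper itself):

```php
/**
 * Draw one index from a discrete distribution such as [0.2, 0.5, 0.3],
 * mirroring the cumulative-sum walk used by LdaTest::draw().
 */
function drawIndex(array $distribution): ?int
{
    $x = mt_rand() / mt_getrandmax(); // uniform number in [0, 1]
    $p = 0.0;
    foreach ($distribution as $i => $probability) {
        $p += $probability;
        if ($p > $x) {
            return $i;
        }
    }

    return null; // only reached if the probabilities sum to less than $x
}

// Roughly half of the draws should land on index 1.
$counts = [0, 0, 0];
for ($n = 0; $n < 10000; $n++) {
    $i = drawIndex([0.2, 0.5, 0.3]);
    if ($i !== null) {
        $counts[$i]++;
    }
}
```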
diff --git a/tests/NlpTools/Similarity/CosineSimilarityTest.php b/tests/NlpTools/Similarity/CosineSimilarityTest.php
index 5959b1e..0c1e26c 100644
--- a/tests/NlpTools/Similarity/CosineSimilarityTest.php
+++ b/tests/NlpTools/Similarity/CosineSimilarityTest.php
@@ -1,84 +1,89 @@
assertEquals(
1,
- $sim->similarity($A,$A),
+ (int) $cosineSimilarity->similarity($A, $A),
"The cosine similarity of a set/vector with itsself should be 1"
);
$this->assertEquals(
1,
- $sim->similarity($A,$A_times_2),
+ (int) $cosineSimilarity->similarity($A, $A_times_2),
"The cosine similarity of a vector with a linear combination of itsself should be 1"
);
$this->assertEquals(
0,
- $sim->similarity($A,$B)-$sim->similarity($A_times_2,$B),
+ (int) ($cosineSimilarity->similarity($A, $B) - $cosineSimilarity->similarity($A_times_2, $B)),
"Parallel vectors should have the same angle with any vector B"
);
}
- public function testProducedAngles()
+ public function testProducedAngles(): void
{
- $sim = new CosineSimilarity();
+ $cosineSimilarity = new CosineSimilarity();
- $ba = array(1,1,2,2,2,2); // ba = (2,4)
- $bc = array(1,1,1,2,2); // bc = (3,2)
- $bba = array('a'=>2,'b'=>4);
- $bbc = array('a'=>3,'b'=>2);
- $ba_to_bc = cos(0.5191461142); // approximately 30 deg
+ $ba = [1, 1, 2, 2, 2, 2]; // ba = (2,4)
+ $bc = [1, 1, 1, 2, 2]; // bc = (3,2)
+ $bba = ['a' => 2, 'b' => 4];
+ $bbc = ['a' => 3, 'b' => 2];
+ $ba_to_bc = round(cos(0.5191461142), 8); // approximately 30 deg
$this->assertEquals(
$ba_to_bc,
- $sim->similarity($ba,$bc)
+ round($cosineSimilarity->similarity($ba, $bc), 8)
);
$this->assertEquals(
$ba_to_bc,
- $sim->similarity($bba,$bbc)
+ round($cosineSimilarity->similarity($bba, $bbc), 8)
);
}
- public function testInvalidArgumentException()
+ public function testInvalidArgumentException(): void
{
- $sim = new CosineSimilarity();
- $a = array(1);
- $zero = array();
+ $cosineSimilarity = new CosineSimilarity();
+ $a = [1];
+ $zero = [];
try {
- $sim->similarity(
+ $cosineSimilarity->similarity(
$a,
$zero
);
$this->fail("Cosine similarity with the zero vector should trigger an exception");
- } catch (\InvalidArgumentException $e) {
+ } catch (\InvalidArgumentException $invalidArgumentException) {
$this->assertEquals(
"Vector \$B is the zero vector",
- $e->getMessage()
+ $invalidArgumentException->getMessage()
);
}
+
try {
- $sim->similarity(
+ $cosineSimilarity->similarity(
$zero,
$a
);
$this->fail("Cosine similarity with the zero vector should trigger an exception");
- } catch (\InvalidArgumentException $e) {
+ } catch (\InvalidArgumentException $invalidArgumentException) {
$this->assertEquals(
"Vector \$A is the zero vector",
- $e->getMessage()
+ $invalidArgumentException->getMessage()
);
}
}
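The eight-decimal rounding introduced above avoids brittle float equality; the constant `0.5191461142` is simply the angle in radians (about 29.7 degrees) between the two vectors encoded by the token counts, ba = (2, 4) and bc = (3, 2). A quick arithmetic check of that expectation, independent of the library:

```php
// ba = (2, 4), bc = (3, 2)
$dot   = 2 * 3 + 4 * 2;             // 14
$normA = sqrt(2 ** 2 + 4 ** 2);     // sqrt(20)
$normB = sqrt(3 ** 2 + 2 ** 2);     // sqrt(13)

$cosine = $dot / ($normA * $normB); // ~0.868243
$angle  = acos($cosine);            // ~0.5191461 rad, i.e. ~29.74 degrees

// round(cos(0.5191461142), 8) therefore matches the cosine similarity of
// the two token bags to eight decimal places, as the test asserts.
```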
diff --git a/tests/NlpTools/Similarity/DiceSimilarityTest.php b/tests/NlpTools/Similarity/DiceSimilarityTest.php
index db22d78..d4d0dfb 100644
--- a/tests/NlpTools/Similarity/DiceSimilarityTest.php
+++ b/tests/NlpTools/Similarity/DiceSimilarityTest.php
@@ -1,32 +1,36 @@
assertEquals(
1,
- $sim->similarity($A,$A),
+ $diceSimilarity->similarity($A, $A),
"The similarity of a set with itsself is 1"
);
$this->assertEquals(
0,
- $sim->similarity($A,$e),
+ $diceSimilarity->similarity($A, $e),
"The similarity of any set with the empty set is 0"
);
$this->assertEquals(
0.75,
- $sim->similarity($A,$B),
+ $diceSimilarity->similarity($A, $B),
"similarity({'my','name','is','john'},{'my','name','is','joe'}) = 0.75"
);
}
diff --git a/tests/NlpTools/Similarity/HammingDistanceTest.php b/tests/NlpTools/Similarity/HammingDistanceTest.php
index f71ca50..22211e9 100644
--- a/tests/NlpTools/Similarity/HammingDistanceTest.php
+++ b/tests/NlpTools/Similarity/HammingDistanceTest.php
@@ -1,27 +1,36 @@
assertEquals(
- max(strlen($A),strlen($B)),
- $dist->dist($A,$B),
+            max(strlen($a), strlen($b)),
+ $hammingDistance->dist($a, $b),
"Two completely dissimilar strings should have distance equal to max(strlen(\$A),strlen(\$B))"
);
$this->assertEquals(
2,
- $dist->dist($C,$D),
+ $hammingDistance->dist($c, $d),
"10101 ~ 11111 have a hamming distance = 2"
);
}
diff --git a/tests/NlpTools/Similarity/JaccardIndexTest.php b/tests/NlpTools/Similarity/JaccardIndexTest.php
index 211c5ea..056b163 100644
--- a/tests/NlpTools/Similarity/JaccardIndexTest.php
+++ b/tests/NlpTools/Similarity/JaccardIndexTest.php
@@ -1,32 +1,36 @@
assertEquals(
1,
- $sim->similarity($A,$A),
+ $jaccardIndex->similarity($A, $A),
"The similarity of a set with itsself is 1"
);
$this->assertEquals(
0,
- $sim->similarity($A,$e),
+ $jaccardIndex->similarity($A, $e),
"The similarity of any set with the empty set is 0"
);
$this->assertEquals(
0.5,
- $sim->similarity($A,$B),
+ $jaccardIndex->similarity($A, $B),
"J({1,2,3},{1,2,3,4,5,6}) = 0.5"
);
}
diff --git a/tests/NlpTools/Similarity/OverlapCoefficientTest.php b/tests/NlpTools/Similarity/OverlapCoefficientTest.php
index 1515960..4e46d00 100644
--- a/tests/NlpTools/Similarity/OverlapCoefficientTest.php
+++ b/tests/NlpTools/Similarity/OverlapCoefficientTest.php
@@ -1,32 +1,36 @@
assertEquals(
1,
- $sim->similarity($A,$A),
+ $overlapCoefficient->similarity($A, $A),
"The similarity of a set with itsself is 1"
);
$this->assertEquals(
0,
- $sim->similarity($A,$e),
+ $overlapCoefficient->similarity($A, $e),
"The similarity of any set with the empty set is 0"
);
$this->assertEquals(
0.5,
- $sim->similarity($A,$B),
+ $overlapCoefficient->similarity($A, $B),
"similarity({'my','name','is','john'},{'your','name','is','joe'}) = 0.5"
);
}
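The expected values in the Dice, Jaccard and overlap-coefficient tests above follow directly from the set-count definitions: Dice = 2|A∩B| / (|A| + |B|), Jaccard = |A∩B| / |A∪B|, overlap = |A∩B| / min(|A|, |B|). A quick check with plain array functions (a sketch; the library classes are not involved):

```php
$A = ["my", "name", "is", "john"];
$B = ["my", "name", "is", "joe"];   // Dice fixture
$C = [1, 2, 3];
$D = [1, 2, 3, 4, 5, 6];            // Jaccard fixture
$E = ["your", "name", "is", "joe"]; // overlap-coefficient fixture

$dice = 2 * count(array_intersect($A, $B))
    / (count($A) + count($B));                  // 2 * 3 / 8 = 0.75

$jaccard = count(array_intersect($C, $D))
    / count(array_unique(array_merge($C, $D))); // 3 / 6     = 0.5

$overlap = count(array_intersect($A, $E))
    / min(count($A), count($E));                // 2 / 4     = 0.5
```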
diff --git a/tests/NlpTools/Similarity/SimhashTest.php b/tests/NlpTools/Similarity/SimhashTest.php
index 85c2321..cba7cbf 100644
--- a/tests/NlpTools/Similarity/SimhashTest.php
+++ b/tests/NlpTools/Similarity/SimhashTest.php
@@ -1,41 +1,44 @@
assertEquals(
1,
- $sim->similarity($A,$A),
+ $simhash->similarity($A, $A),
"Two identical sets should have the same hash therefore a similarity of 1"
);
$this->assertGreaterThan(
- $sim->similarity($A,$B),
- $sim->similarity($b,$B),
+ $simhash->similarity($A, $B),
+ $simhash->similarity($b, $B),
"The more elements in common the more similar the two sets should be"
);
}
- public function testWeightedSets()
+ public function testWeightedSets(): void
{
- $sim = new Simhash(64);
+ $simhash = new Simhash(64);
- $A = array("a","a","a","b","b",);
- $B = array("a"=>3,"b"=>2);
+ $A = ["a", "a", "a", "b", "b"];
+ $B = ["a" => 3, "b" => 2];
$this->assertEquals(
1,
- $sim->similarity($A,$B),
+ $simhash->similarity($A, $B),
"The two sets are identical given that one is the weighted version of the other"
);
}
diff --git a/tests/NlpTools/Similarity/TverskyIndexTest.php b/tests/NlpTools/Similarity/TverskyIndexTest.php
index f12f023..92193a2 100644
--- a/tests/NlpTools/Similarity/TverskyIndexTest.php
+++ b/tests/NlpTools/Similarity/TverskyIndexTest.php
@@ -1,47 +1,55 @@
$A
+ * @param array $B
+ */
+ private function sim(array $A, array $B, float $a, int $b): float
{
- $sim = new TverskyIndex($a, $b);
+ $tverskyIndex = new TverskyIndex($a, $b);
- return $sim->similarity($A, $B);
+ return $tverskyIndex->similarity($A, $B);
}
- public function testTverskyIndex()
+ public function testTverskyIndex(): void
{
- $sim = new TverskyIndex();
+ new TverskyIndex();
- $A = array("my","name","is","john");
- $B = array("my","name","is","joe");
- $C = array(1,2,3);
- $D = array(1,2,3,4,5,6);
- $e = array();
+ $A = ["my", "name", "is", "john"];
+ $B = ["my", "name", "is", "joe"];
+ $C = [1, 2, 3];
+ $D = [1, 2, 3, 4, 5, 6];
+ $e = [];
$this->assertEquals(
1,
- $this->sim($A,$A, 0.5, 1),
+ $this->sim($A, $A, 0.5, 1),
"The similarity of a set with itsself is 1"
);
$this->assertEquals(
0,
- $this->sim($A,$e, 0.5, 2),
+ $this->sim($A, $e, 0.5, 2),
"The similarity of any set with the empty set is 0"
);
$this->assertEquals(
0.75,
- $this->sim($A,$B, 0.5, 1),
+ $this->sim($A, $B, 0.5, 1),
"similarity({'my','name','is','john'},{'my','name','is','joe'}) = 0.75"
);
$this->assertEquals(
0.5,
- $this->sim($C,$D, 0.5, 2),
+ $this->sim($C, $D, 0.5, 2),
"similarity({1,2,3},{1,2,3,4,5,6}) = 0.5"
);
}
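The 0.75 and 0.5 expectations here are consistent with the symmetric Tversky variant, sim = c / (c + b(a·min(x, y) + (1 - a)·max(x, y))) with c = |A∩B|, x = |A\B|, y = |B\A|. Note that this formula is inferred from the asserted numbers, not stated anywhere in this diff. A quick arithmetic check under that assumption:

```php
// Assumed symmetric Tversky variant (not confirmed by this diff):
$tversky = function (array $A, array $B, float $a, int $b): float {
    $c = count(array_intersect($A, $B)); // |A intersect B|
    $x = count(array_diff($A, $B));      // |A \ B|
    $y = count(array_diff($B, $A));      // |B \ A|

    return $c / ($c + $b * ($a * min($x, $y) + (1 - $a) * max($x, $y)));
};

// {'my','name','is','john'} vs {'my','name','is','joe'} with a = 0.5, b = 1:
//   3 / (3 + 1 * (0.5 * 1 + 0.5 * 1)) = 0.75, matching the assertion above.
// {1,2,3} vs {1,2,3,4,5,6} with a = 0.5, b = 2:
//   3 / (3 + 2 * (0.5 * 0 + 0.5 * 3)) = 0.5, matching as well.
```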
diff --git a/tests/NlpTools/Stemmers/GreekStemmerTest.php b/tests/NlpTools/Stemmers/GreekStemmerTest.php
index cf040a3..3e511f4 100644
--- a/tests/NlpTools/Stemmers/GreekStemmerTest.php
+++ b/tests/NlpTools/Stemmers/GreekStemmerTest.php
@@ -1,7 +1,12 @@
setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY);
$stems->setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY);
$stems->rewind();
- $stemmer = new GreekStemmer();
- $this->checkStemmer($stemmer, $words, $stems);
+ $greekStemmer = new GreekStemmer();
+ $this->checkStemmer($greekStemmer, $words, $stems);
}
}
diff --git a/tests/NlpTools/Stemmers/LancasterStemmerTest.php b/tests/NlpTools/Stemmers/LancasterStemmerTest.php
index 68908de..321589e 100644
--- a/tests/NlpTools/Stemmers/LancasterStemmerTest.php
+++ b/tests/NlpTools/Stemmers/LancasterStemmerTest.php
@@ -1,36 +1,40 @@
assertEquals('maxim', $stemmer->stem('maximum'));
- $this->assertEquals('presum', $stemmer->stem('presumably'));
- $this->assertEquals('multiply', $stemmer->stem('multiply'));
- $this->assertEquals('provid', $stemmer->stem('provision'));
- $this->assertEquals('ow', $stemmer->stem('owed'));
- $this->assertEquals('ear', $stemmer->stem('ear'));
- $this->assertEquals('say', $stemmer->stem('saying'));
- $this->assertEquals('cry', $stemmer->stem('crying'));
- $this->assertEquals('string', $stemmer->stem('string'));
- $this->assertEquals('meant', $stemmer->stem('meant'));
- $this->assertEquals('cem', $stemmer->stem('cement'));
+ $lancasterStemmer = new LancasterStemmer();
+ $this->assertEquals('maxim', $lancasterStemmer->stem('maximum'));
+ $this->assertEquals('presum', $lancasterStemmer->stem('presumably'));
+ $this->assertEquals('multiply', $lancasterStemmer->stem('multiply'));
+ $this->assertEquals('provid', $lancasterStemmer->stem('provision'));
+ $this->assertEquals('ow', $lancasterStemmer->stem('owed'));
+ $this->assertEquals('ear', $lancasterStemmer->stem('ear'));
+ $this->assertEquals('say', $lancasterStemmer->stem('saying'));
+ $this->assertEquals('cry', $lancasterStemmer->stem('crying'));
+ $this->assertEquals('string', $lancasterStemmer->stem('string'));
+ $this->assertEquals('meant', $lancasterStemmer->stem('meant'));
+ $this->assertEquals('cem', $lancasterStemmer->stem('cement'));
}
/**
* Added to cover issue #34
*/
- public function testEmptyStringForWord()
+ public function testEmptyStringForWord(): void
{
- $stemmer = new LancasterStemmer();
- $this->assertEquals("", $stemmer->stem(""));
+ $lancasterStemmer = new LancasterStemmer();
+ $this->assertEquals("", $lancasterStemmer->stem(""));
}
}
-
diff --git a/tests/NlpTools/Stemmers/PorterStemmerTest.php b/tests/NlpTools/Stemmers/PorterStemmerTest.php
index e9e387f..ebec365 100644
--- a/tests/NlpTools/Stemmers/PorterStemmerTest.php
+++ b/tests/NlpTools/Stemmers/PorterStemmerTest.php
@@ -1,7 +1,12 @@
setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY);
$stems->setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY);
$stems->rewind();
- $stemmer = new PorterStemmer();
- $this->checkStemmer($stemmer, $words, $stems);
+ $porterStemmer = new PorterStemmer();
+ $this->checkStemmer($porterStemmer, $words, $stems);
}
}
diff --git a/tests/NlpTools/Stemmers/StemmerTestBase.php b/tests/NlpTools/Stemmers/StemmerTestBase.php
index 1c7bd22..458ced1 100644
--- a/tests/NlpTools/Stemmers/StemmerTestBase.php
+++ b/tests/NlpTools/Stemmers/StemmerTestBase.php
@@ -1,22 +1,30 @@
current();
$this->assertEquals(
- $stemmer->stem($word),
$stem,
- "The stem for '$word' should be '$stem' not '{$stemmer->stem($word)}'"
+ $stemmer->stem($word),
+ sprintf("The stem for '%s' should be '%s' not '%s'", $word, $stem, $stemmer->stem($word))
);
$stems->next();
}
diff --git a/tests/NlpTools/Stemmers/TransformationTest.php b/tests/NlpTools/Stemmers/TransformationTest.php
index 3a03e29..2746ef7 100644
--- a/tests/NlpTools/Stemmers/TransformationTest.php
+++ b/tests/NlpTools/Stemmers/TransformationTest.php
@@ -1,37 +1,42 @@
+ */
+ public static function provideStemmers(): array
{
- return array(
- array(new LancasterStemmer()),
- array(new PorterStemmer())
- );
+ return [
+ 'LancasterStemmer' => [new LancasterStemmer()],
+ 'PorterStemmer' => [new PorterStemmer()]
+ ];
}
- /**
- * @dataProvider provideStemmers
- */
- public function testStemmer(Stemmer $stemmer)
+ #[DataProvider('provideStemmers')]
+ public function testStemmer(Stemmer $stemmer): void
{
- $tokens = explode(" ","this renowned monster who had come off victorious in a hundred fights with his pursuers was an old bull whale of prodigious size and strength from the effect of age or more probably from a freak of nature a singular consequence had resulted he was white as wool");
+ $tokens = explode(" ", "this renowned monster who had come off victorious in a hundred fights with his pursuers was an old bull whale of prodigious size and strength from the effect of age or more probably from a freak of nature a singular consequence had resulted he was white as wool");
$stemmed = $stemmer->stemAll($tokens);
- $doc = new TokensDocument($tokens);
+ $tokensDocument = new TokensDocument($tokens);
$this->assertNotEquals(
$stemmed,
- $doc->getDocumentData()
+ $tokensDocument->getDocumentData()
);
- $doc->applyTransformation($stemmer);
+ $tokensDocument->applyTransformation($stemmer);
$this->assertEquals(
$stemmed,
- $doc->getDocumentData()
+ $tokensDocument->getDocumentData()
);
}
}
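Besides the attribute migration, the test above leans on the fact that a `Stemmer` doubles as a document transformation: `stemAll()` on a raw token array and `applyTransformation()` on a `TokensDocument` must yield the same data. A condensed usage sketch of that equivalence, mirroring the calls made in the test:

```php
use NlpTools\Documents\TokensDocument;
use NlpTools\Stemmers\PorterStemmer;

$tokens  = explode(" ", "a singular consequence had resulted");
$stemmer = new PorterStemmer();

// Stemming the plain token list...
$stemmed = $stemmer->stemAll($tokens);

// ...matches applying the stemmer as a transformation to a document.
$document = new TokensDocument($tokens);
$document->applyTransformation($stemmer);

var_dump($stemmed == $document->getDocumentData()); // bool(true), per the test
```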
diff --git a/tests/NlpTools/Tokenizers/ClassifierBasedTokenizerTest.php b/tests/NlpTools/Tokenizers/ClassifierBasedTokenizerTest.php
index d02ec35..e55ef9d 100644
--- a/tests/NlpTools/Tokenizers/ClassifierBasedTokenizerTest.php
+++ b/tests/NlpTools/Tokenizers/ClassifierBasedTokenizerTest.php
@@ -1,14 +1,17 @@
assertEquals(
- array(
- "We are what we repeatedly do.",
- "Excellence, then, is not an act, but a habit."
- ),
- $tok->tokenize($text)
+ ["We are what we repeatedly do.", "Excellence, then, is not an act, but a habit."],
+ $classifierBasedTokenizer->tokenize($text)
);
}
}
diff --git a/tests/NlpTools/Tokenizers/PennTreeBankTokenizerTest.php b/tests/NlpTools/Tokenizers/PennTreeBankTokenizerTest.php
index c8daf0d..6f24b6e 100644
--- a/tests/NlpTools/Tokenizers/PennTreeBankTokenizerTest.php
+++ b/tests/NlpTools/Tokenizers/PennTreeBankTokenizerTest.php
@@ -1,54 +1,56 @@
tokenize("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.");
+ $pennTreeBankTokenizer = new PennTreeBankTokenizer();
+ $tokens = $pennTreeBankTokenizer->tokenize("Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.");
$this->assertCount(16, $tokens);
}
- public function testTokenizer2()
+ public function testTokenizer2(): void
{
- $tokenizer = new PennTreeBankTokenizer();
- $this->assertCount(7, $tokenizer->tokenize("They'll save and invest more."));
+ $pennTreeBankTokenizer = new PennTreeBankTokenizer();
+ $this->assertCount(7, $pennTreeBankTokenizer->tokenize("They'll save and invest more."));
}
-
- public function testTokenizer3()
+
+ public function testTokenizer3(): void
{
- $tokenizer = new PennTreeBankTokenizer();
- $this->assertCount(4, $tokenizer->tokenize("I'm some text"));
+ $pennTreeBankTokenizer = new PennTreeBankTokenizer();
+ $this->assertCount(4, $pennTreeBankTokenizer->tokenize("I'm some text"));
}
-
- public function testAgainstOriginalSedImplementation()
+
+ public function testAgainstOriginalSedImplementation(): void
{
- $tokenizer = new PennTreeBankTokenizer();
- $tokenized = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/tokenized");
+ $pennTreeBankTokenizer = new PennTreeBankTokenizer();
+ $tokenized = new \SplFileObject(TEST_DATA_DIR . "/Tokenizers/PennTreeBankTokenizerTest/tokenized");
$tokenized->setFlags(\SplFileObject::DROP_NEW_LINE);
- $sentences = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/test.txt");
+
+ $sentences = new \SplFileObject(TEST_DATA_DIR . "/Tokenizers/PennTreeBankTokenizerTest/test.txt");
$sentences->setFlags(\SplFileObject::DROP_NEW_LINE);
-
+
$tokenized->rewind();
foreach ($sentences as $sentence) {
- if ($sentence) // skip empty lines
- {
+ if ($sentence) { // skip empty lines
$this->assertEquals(
$tokenized->current(),
- implode(" ",$tokenizer->tokenize($sentence)),
- "Sentence: '$sentence' was not tokenized correctly"
+ implode(" ", $pennTreeBankTokenizer->tokenize($sentence)),
+ sprintf("Sentence: '%s' was not tokenized correctly", $sentence)
);
}
+
$tokenized->next();
}
-
}
-
}
diff --git a/tests/NlpTools/Tokenizers/RegexTokenizerTest.php b/tests/NlpTools/Tokenizers/RegexTokenizerTest.php
index f751395..6ff84ef 100644
--- a/tests/NlpTools/Tokenizers/RegexTokenizerTest.php
+++ b/tests/NlpTools/Tokenizers/RegexTokenizerTest.php
@@ -1,86 +1,82 @@
tokenize("0 1 2 3 4 5 6 7 8 9");
$this->assertCount(10, $tokens);
- $this->assertEquals("0123456789",implode("",$tokens));
+ $this->assertEquals("0123456789", implode("", $tokens));
// check split2
- $tok = new RegexTokenizer(array(
- "/\n+/"
- ));
+ $tok = new RegexTokenizer(["/\n+/"]);
$tokens = $tok->tokenize("0 1 2 3 4\n5 6 7 8 9");
$this->assertCount(2, $tokens);
- $this->assertEquals("0 1 2 3 45 6 7 8 9",implode("",$tokens));
+ $this->assertEquals("0 1 2 3 45 6 7 8 9", implode("", $tokens));
$tokens = $tok->tokenize("0 1 2 3 4\n\n5 6 7 8 9");
$this->assertCount(2, $tokens);
- $this->assertEquals("0 1 2 3 45 6 7 8 9",implode("",$tokens));
-
+ $this->assertEquals("0 1 2 3 45 6 7 8 9", implode("", $tokens));
}
/**
* Test a pattern that captures instead of splits
*/
- public function testMatches()
+ public function testMatches(): void
{
// check keep matches
- $tok = new RegexTokenizer(array(
- array("/(\s+)?(\w+)(\s+)?/",2)
- ));
+ $regexTokenizer = new RegexTokenizer([["/(\s+)?(\w+)(\s+)?/", 2]]);
- $tokens = $tok->tokenize("0 1 2 3 4 5 6 7 8 9");
+ $tokens = $regexTokenizer->tokenize("0 1 2 3 4 5 6 7 8 9");
$this->assertCount(10, $tokens);
- $this->assertEquals("0123456789",implode("",$tokens));
+ $this->assertEquals("0123456789", implode("", $tokens));
}
/**
* Test a pattern that firsts replaces all digits with themselves separated
* by a space and then tokenizes on whitespace.
*/
- public function testReplace()
+ public function testReplace(): void
{
// check keep matches
- $tok = new RegexTokenizer(array(
- array("/\d/",'$0 '),
- WhitespaceTokenizer::PATTERN
- ));
+ $regexTokenizer = new RegexTokenizer([["/\d/", '$0 '], WhitespaceTokenizer::PATTERN]);
- $tokens = $tok->tokenize("0123456789");
+ $tokens = $regexTokenizer->tokenize("0123456789");
$this->assertCount(10, $tokens);
- $this->assertEquals("0123456789",implode("",$tokens));
+ $this->assertEquals("0123456789", implode("", $tokens));
}
/**
* Test a simple pattern meant to split the full stop from the last
* word of a sentence.
*/
- public function testSplitWithManyPatterns()
+ public function testSplitWithManyPatterns(): void
{
- $tok = new RegexTokenizer(array(
- WhitespaceTokenizer::PATTERN, // split on whitespace
- array("/([^\.])\.$/",'$1 .'), // replace . with .
- "/ /" // split on
- ));
+        $regexTokenizer = new RegexTokenizer([
+            WhitespaceTokenizer::PATTERN,  // split on whitespace
+            ["/([^\.])\.$/", '$1 .'],      // detach a trailing full stop ("x." becomes "x .")
+            "/ /",                         // then split on the space
+        ]);
// example text stolen from NLTK :-)
$str = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks.";
- $tokens = $tok->tokenize($str);
+ $tokens = $regexTokenizer->tokenize($str);
$this->assertCount(17, $tokens);
$this->assertEquals($tokens[3], "$3.88");
$this->assertEquals($tokens[7], ".");
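The rewritten pattern array keeps the three forms these tests exercise: a bare pattern string splits the input, a `[pattern, int]` pair keeps that capture group of every match, and a `[pattern, string]` pair performs a replacement whose output feeds the next pattern. That reading is inferred from the test names and comments in this file, so treat it as a summary rather than documentation. Side by side (all three constructions appear verbatim in the tests above):

```php
use NlpTools\Tokenizers\RegexTokenizer;
use NlpTools\Tokenizers\WhitespaceTokenizer;

// 1. Bare pattern: split on the matches (here, runs of newlines).
$split = new RegexTokenizer(["/\n+/"]);

// 2. [pattern, int]: keep capture group 2 of every match.
$capture = new RegexTokenizer([["/(\s+)?(\w+)(\s+)?/", 2]]);

// 3. [pattern, string]: replace first, then let the next pattern tokenize.
$replaceThenSplit = new RegexTokenizer([
    ["/\d/", '$0 '],              // put a space after every digit
    WhitespaceTokenizer::PATTERN, // then split on whitespace
]);

$replaceThenSplit->tokenize("0123456789"); // 10 tokens: "0" ... "9"
```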
diff --git a/tests/NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizerTest.php b/tests/NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizerTest.php
new file mode 100644
index 0000000..9eeedf1
--- /dev/null
+++ b/tests/NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizerTest.php
@@ -0,0 +1,47 @@
+assertEquals(
+ $tokens,
+ $whitespaceAndPunctuationTokenizer->tokenize($s)
+ );
+ }
+
+ public function testTokenizerOnUtf8(): void
+ {
+ $whitespaceAndPunctuationTokenizer = new WhitespaceAndPunctuationTokenizer();
+
+ $s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων";
+ $tokens = ['Ελληνικό', 'κείμενο', 'για', 'παράδειγμα', 'utf', '-', '8', 'χαρακτήρων'];
+ // test tokenization of multibyte non-whitespace characters
+ $this->assertEquals(
+ $tokens,
+ $whitespaceAndPunctuationTokenizer->tokenize($s)
+ );
+
+ $s = "Here exists non-breaking space ";
+ $tokens = ['Here', 'exists', 'non', '-', 'breaking', 'space'];
+ // test tokenization of multibyte whitespace
+ $this->assertEquals(
+ $tokens,
+ $whitespaceAndPunctuationTokenizer->tokenize($s)
+ );
+ }
+}
diff --git a/tests/NlpTools/Tokenizers/WhitespaceAndPuntuationTokenizerTest.php b/tests/NlpTools/Tokenizers/WhitespaceAndPuntuationTokenizerTest.php
deleted file mode 100644
index 2a8f46b..0000000
--- a/tests/NlpTools/Tokenizers/WhitespaceAndPuntuationTokenizerTest.php
+++ /dev/null
@@ -1,44 +0,0 @@
-assertEquals(
- $tokens,
- $tok->tokenize($s)
- );
- }
-
- public function testTokenizerOnUtf8()
- {
- $tok = new WhitespaceAndPunctuationTokenizer();
-
- $s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων";
- $tokens = array('Ελληνικό','κείμενο','για','παράδειγμα','utf','-','8','χαρακτήρων');
- // test tokenization of multibyte non-whitespace characters
- $this->assertEquals(
- $tokens,
- $tok->tokenize($s)
- );
-
- $s = "Here exists non-breaking space ";
- $tokens = array('Here','exists','non','-','breaking','space');
- // test tokenization of multibyte whitespace
- $this->assertEquals(
- $tokens,
- $tok->tokenize($s)
- );
- }
-}
diff --git a/tests/NlpTools/Tokenizers/WhitespaceTokenizerTest.php b/tests/NlpTools/Tokenizers/WhitespaceTokenizerTest.php
index 824d14e..8b416d3 100644
--- a/tests/NlpTools/Tokenizers/WhitespaceTokenizerTest.php
+++ b/tests/NlpTools/Tokenizers/WhitespaceTokenizerTest.php
@@ -1,44 +1,46 @@
assertEquals(
$tokens,
- $tok->tokenize($s)
+ $whitespaceTokenizer->tokenize($s)
);
}
- public function testTokenizerOnUtf8()
+ public function testTokenizerOnUtf8(): void
{
- $tok = new WhitespaceTokenizer();
+ $whitespaceTokenizer = new WhitespaceTokenizer();
$s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων";
- $tokens = array('Ελληνικό','κείμενο','για','παράδειγμα','utf-8','χαρακτήρων');
+ $tokens = ['Ελληνικό', 'κείμενο', 'για', 'παράδειγμα', 'utf-8', 'χαρακτήρων'];
// test tokenization of multibyte non-whitespace characters
$this->assertEquals(
$tokens,
- $tok->tokenize($s)
+ $whitespaceTokenizer->tokenize($s)
);
$s = "Here exists non-breaking space ";
- $tokens = array('Here','exists','non-breaking','space');
+ $tokens = ['Here', 'exists', 'non-breaking', 'space'];
// test tokenization of multibyte whitespace
$this->assertEquals(
$tokens,
- $tok->tokenize($s)
+ $whitespaceTokenizer->tokenize($s)
);
}
}
diff --git a/tests/NlpTools/Utils/ClassifierBasedTransformationTest.php b/tests/NlpTools/Utils/ClassifierBasedTransformationTest.php
index 8801faa..fe60296 100644
--- a/tests/NlpTools/Utils/ClassifierBasedTransformationTest.php
+++ b/tests/NlpTools/Utils/ClassifierBasedTransformationTest.php
@@ -1,39 +1,46 @@
$classes
+ */
+ public function classify(array $classes, DocumentInterface $document): string
{
- return $classes[$d->getDocumentData() % count($classes)];
+ return $classes[$document->getDocumentData() % count($classes)];
}
- public function testEvenAndOdd()
+ public function testEvenAndOdd(): void
{
- $stubEven = $this->getMock("NlpTools\\Utils\\TransformationInterface");
+ $stubEven = $this->createMock(TransformationInterface::class);
$stubEven->expects($this->any())
->method('transform')
- ->will($this->returnValue('even'));
- $stubOdd = $this->getMock("NlpTools\\Utils\\TransformationInterface");
+ ->willReturn('even');
+ $stubOdd = $this->createMock(TransformationInterface::class);
$stubOdd->expects($this->any())
->method('transform')
- ->will($this->returnValue('odd'));
+ ->willReturn('odd');
- $transform = new ClassifierBasedTransformation($this);
- $transform->register("even", $stubEven);
- $transform->register("odd", $stubOdd);
+ $classifierBasedTransformation = new ClassifierBasedTransformation($this);
+ $classifierBasedTransformation->register("even", $stubEven);
+ $classifierBasedTransformation->register("odd", $stubOdd);
$this->assertEquals(
"odd",
- $transform->transform(3)
+ $classifierBasedTransformation->transform('3')
);
$this->assertEquals(
"even",
- $transform->transform(4)
+ $classifierBasedTransformation->transform('4')
);
}
}
diff --git a/tests/NlpTools/Utils/EnglishVowelsTest.php b/tests/NlpTools/Utils/EnglishVowelsTest.php
index a3e6690..5f42452 100644
--- a/tests/NlpTools/Utils/EnglishVowelsTest.php
+++ b/tests/NlpTools/Utils/EnglishVowelsTest.php
@@ -1,23 +1,26 @@
assertTrue($vowelChecker->isVowel("man", 1));
+ public function testIsVowel(): void
+ {
+ $vowelsAbstractFactory = VowelsAbstractFactory::factory("English");
+ $this->assertTrue($vowelsAbstractFactory->isVowel("man", 1));
}
-
- public function testYIsVowel()
+
+ public function testYIsVowel(): void
{
- $vowelChecker = VowelsAbstractFactory::factory("English");
- $this->assertTrue($vowelChecker->isVowel("try", 2));
+ $vowelsAbstractFactory = VowelsAbstractFactory::factory("English");
+ $this->assertTrue($vowelsAbstractFactory->isVowel("try", 2));
}
}
-
-
diff --git a/tests/NlpTools/Utils/IdentityTransformer.php b/tests/NlpTools/Utils/IdentityTransformer.php
index df48bd3..e3f02ed 100644
--- a/tests/NlpTools/Utils/IdentityTransformer.php
+++ b/tests/NlpTools/Utils/IdentityTransformer.php
@@ -1,5 +1,7 @@
assertEquals(
- explode(" ","ο μορφωμενοσ διαφερει απο τον αμορφωτο οσο ο ζωντανοσ απο τον νεκρο"),
+ explode(" ", "ο μορφωμενοσ διαφερει απο τον αμορφωτο οσο ο ζωντανοσ απο τον νεκρο"),
$greek->normalizeAll(
- explode(" ","Ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό")
+ explode(" ", "Ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό")
)
);
$this->assertEquals(
- explode(" ","ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό"),
- $english->normalizeAll(
- explode(" ","Ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό")
+ explode(" ", "ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό"),
+ $normalizer->normalizeAll(
+ explode(" ", "Ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό")
)
);
$this->assertEquals(
- explode(" ","when a father gives to his son both laugh when a son gives to his father both cry" ),
- $english->normalizeAll(
- explode(" ","When a father gives to his son both laugh when a son gives to his father both cry" )
+ explode(" ", "when a father gives to his son both laugh when a son gives to his father both cry"),
+ $normalizer->normalizeAll(
+ explode(" ", "When a father gives to his son both laugh when a son gives to his father both cry")
)
);
}
diff --git a/tests/NlpTools/Utils/StopWordsTest.php b/tests/NlpTools/Utils/StopWordsTest.php
index e18fcf3..4a40831 100644
--- a/tests/NlpTools/Utils/StopWordsTest.php
+++ b/tests/NlpTools/Utils/StopWordsTest.php
@@ -1,48 +1,41 @@
applyTransformation($stopwords);
+ $tokensDocument = new TokensDocument(explode(" ", "if you tell the truth you do not have to remember anything"));
+ $tokensDocument->applyTransformation($stopwords);
$this->assertEquals(
- array(
- "if", "you", "tell", "truth", "you", "do", "not", "have", "remember", "anything"
- ),
- $doc->getDocumentData()
+ ["if", "you", "tell", "truth", "you", "do", "not", "have", "remember", "anything"],
+ $tokensDocument->getDocumentData()
);
}
- public function testStopwordsWithTransformation()
+ public function testStopwordsWithTransformation(): void
{
$stopwords = new StopWords(
- array(
- "to",
- "the"
- ),
+ ["to", "the"],
Normalizer::factory("English")
);
- $doc = new TokensDocument(explode(" ", "If you Tell The truth You do not have To remember Anything"));
- $doc->applyTransformation($stopwords);
+ $tokensDocument = new TokensDocument(explode(" ", "If you Tell The truth You do not have To remember Anything"));
+ $tokensDocument->applyTransformation($stopwords);
$this->assertEquals(
- array(
- "If", "you", "Tell", "truth", "You", "do", "not", "have", "remember", "Anything"
- ),
- $doc->getDocumentData()
+ ["If", "you", "Tell", "truth", "You", "do", "not", "have", "remember", "Anything"],
+ $tokensDocument->getDocumentData()
);
}
}
diff --git a/tests/README.markdown b/tests/README.markdown
deleted file mode 100644
index c112a60..0000000
--- a/tests/README.markdown
+++ /dev/null
@@ -1,26 +0,0 @@
-Testing information
-===================
-
-This readme contains a bit of information regarding writing tests for NlpTools and executing them.
-
-Writing Tests
--------------
-
-* Test classes should be in the same namespace as the class that is being tested
-* Any data needed for the test or produced by the test should be in the 'data' directory
- under the same folder as the namespace. Only data needed (not produced) are commited to
- the repository.
-* Tests should be marked with the groups **Slow** and **VerySlow** if they require more than
- 10 seconds and 1 minute respectively. If a test is marked as VerySlow it should also be marked
- as Slow.
-* Both functional and unit tests are welcome.
-
-Executing Tests
----------------
-
-Currently only one testsuite is defined (all tests). Because some tests take a long time to
-run you can try running `phpunit --exclude-group Slow` or `phpunit --exclude-group VerySlow`
-to avoid some slow tests.
-
-PHPUnit should be run from inside the tests folder or the phpunit.xml file should be provided
-as config.
diff --git a/tests/bootstrap.php b/tests/bootstrap.php
index 94f23fe..5177769 100644
--- a/tests/bootstrap.php
+++ b/tests/bootstrap.php
@@ -1,27 +1,31 @@
-
- ./NlpTools/
-
-