Moved some methods from SpamModelTrainer to separate classes

mikoweb · mikoweb · commit 215a7416ed01 · 2024-05-28T17:50:24.000+02:00
diff --git a/src/Module/ML/Application/Model/LearnerFactory.php b/src/Module/ML/Application/Model/LearnerFactory.php
@@ -0,0 +1,55 @@
+<?php
+
+namespace App\Module\ML\Application\Model;
+
+use App\Module\ML\Domain\Constant;
+use Rubix\ML\Classifiers\ClassificationTree;
+use Rubix\ML\Classifiers\RandomForest;
+use Rubix\ML\Learner;
+use Rubix\ML\Pipeline;
+use Rubix\ML\Tokenizers\WordStemmer;
+use Rubix\ML\Transformers\MultibyteTextNormalizer;
+use Rubix\ML\Transformers\StopWordFilter;
+use Rubix\ML\Transformers\TfIdfTransformer;
+use Rubix\ML\Transformers\WordCountVectorizer;
+use Rubix\ML\Transformers\ZScaleStandardizer;
+
+/**
+ * @see https://docs.rubixml.com/latest/classifiers/random-forest.html
+ * @see https://docs.rubixml.com/latest/tokenizers/word-stemmer.html
+ * @see https://docs.rubixml.com/latest/transformers/word-count-vectorizer.html
+ * @see https://docs.rubixml.com/latest/transformers/stop-word-filter.html
+ * @see https://docs.rubixml.com/latest/transformers/tf-idf-transformer.html
+ * @see https://docs.rubixml.com/latest/transformers/z-scale-standardizer.html
+ */
+class LearnerFactory
+{
+    /** Builds a Pipeline of text transformers wrapped around a RandomForest of ClassificationTree learners. */
+    public static function createLearner(
+        int $uniqueWordsNum,
+        int $minDocumentCount = Constant::DEFAULT_MIN_DOCUMENT_COUNT,
+        float $maxDocumentRatio = Constant::DEFAULT_MAX_DOCUMENT_RATIO,
+        string $language = Constant::DEFAULT_LANGUAGE,
+        int $treeMaxHeight = PHP_INT_MAX,
+        int $treeEstimators = Constant::DEFAULT_TREE_ESTIMATORS,
+        float $treeRatio = Constant::DEFAULT_TREE_RATIO,
+        bool $treeBalanced = Constant::DEFAULT_TREE_BALANCED,
+    ): Learner {
+        $transformers = [
+            new MultibyteTextNormalizer(),
+            new StopWordFilter(Constant::STOP_WORDS),
+            new WordCountVectorizer(
+                maxVocabularySize: $uniqueWordsNum,
+                minDocumentCount: $minDocumentCount,
+                maxDocumentRatio: $maxDocumentRatio,
+                tokenizer: new WordStemmer($language),
+            ),
+            new TfIdfTransformer(),
+            new ZScaleStandardizer(),
+        ];
+        $baseTree = new ClassificationTree($treeMaxHeight);
+        $forest = new RandomForest($baseTree, estimators: $treeEstimators, ratio: $treeRatio, balanced: $treeBalanced);
+
+        return new Pipeline($transformers, $forest);
+    }
+}
diff --git a/src/Module/ML/Application/Model/SpamModelTrainer.php b/src/Module/ML/Application/Model/SpamModelTrainer.php
@@ -5,32 +5,14 @@
 use App\Core\Application\Path\AppPathResolver;
 use App\Core\Infrastructure\Bus\CommandBusInterface;
 use App\Module\ML\Application\Interaction\Command\SaveSpamDataset\SaveSpamDatasetCommand;
+use App\Module\ML\Application\Utils\WordsUtils;
 use App\Module\ML\Domain\Constant;
-use Rubix\ML\Classifiers\ClassificationTree;
-use Rubix\ML\Classifiers\RandomForest;
 use Rubix\ML\Datasets\Labeled;
 use Rubix\ML\Extractors\CSV;
 use Rubix\ML\PersistentModel;
 use Rubix\ML\Persisters\Filesystem;
-use Rubix\ML\Pipeline;
-use Rubix\ML\Tokenizers\WordStemmer;
-use Rubix\ML\Transformers\MultibyteTextNormalizer;
-use Rubix\ML\Transformers\StopWordFilter;
-use Rubix\ML\Transformers\TfIdfTransformer;
-use Rubix\ML\Transformers\WordCountVectorizer;
-use Rubix\ML\Transformers\ZScaleStandardizer;
 use Symfony\Component\Console\Style\SymfonyStyle;
 
-use function Symfony\Component\String\u;
-
-/**
- * @see https://docs.rubixml.com/latest/classifiers/random-forest.html
- * @see https://docs.rubixml.com/latest/tokenizers/word-stemmer.html
- * @see https://docs.rubixml.com/latest/transformers/word-count-vectorizer.html
- * @see https://docs.rubixml.com/latest/transformers/stop-word-filter.html
- * @see https://docs.rubixml.com/latest/transformers/tf-idf-transformer.html
- * @see https://docs.rubixml.com/latest/transformers/z-scale-standardizer.html
- */
 class SpamModelTrainer
 {
     private static ?SymfonyStyle $io = null;
@@ -72,24 +54,19 @@ public function train(
         self::$io?->info(sprintf('The training dataset contains `%d` samples.', $training->numSamples()));
 
         $modelPath = $this->appPathResolver->getModelPath($outputModelFilename);
+        $uniqueWordsNum = WordsUtils::countUniqueWords($dataset->samples(), $minWordsCount);
+
         $estimator = new PersistentModel(
-            new Pipeline([
-                new MultibyteTextNormalizer(),
-                new StopWordFilter(Constant::STOP_WORDS),
-                new WordCountVectorizer(
-                    maxVocabularySize: $this->countUniqueWords($dataset->samples(), $minWordsCount),
-                    minDocumentCount: $minDocumentCount,
-                    maxDocumentRatio: $maxDocumentRatio,
-                    tokenizer: new WordStemmer($language),
-                ),
-                new TfIdfTransformer(),
-                new ZScaleStandardizer(),
-            ], new RandomForest(
-                new ClassificationTree($treeMaxHeight),
-                estimators: $treeEstimators,
-                ratio: $treeRatio,
-                balanced: $treeBalanced,
-            )),
+            LearnerFactory::createLearner(
+                uniqueWordsNum: $uniqueWordsNum,
+                minDocumentCount: $minDocumentCount,
+                maxDocumentRatio: $maxDocumentRatio,
+                language: $language,
+                treeMaxHeight: $treeMaxHeight,
+                treeEstimators: $treeEstimators,
+                treeRatio: $treeRatio,
+                treeBalanced: $treeBalanced,
+            ),
             new Filesystem($modelPath, $history)
         );
 
@@ -124,31 +101,4 @@ private function saveDataset(
 
         self::$io?->info(sprintf('Saved dataset `%s`.', $outputDatasetFilename));
     }
-
-    /**
-     * @param array<string[]> $samples
-     */
-    private function countUniqueWords(array $samples, int $minCount): int
-    {
-        $words = [];
-
-        foreach ($samples as $sample) {
-            $items = array_filter(
-                preg_split('/\s/', $sample[0]),
-                fn (string $word) => !empty($word)
-            );
-
-            foreach ($items as $item) {
-                $word = u($item)->snake()->toString();
-
-                if (!isset($words[$word])) {
-                    $words[$word] = 1;
-                } else {
-                    ++$words[$word];
-                }
-            }
-        }
-
-        return count(array_filter($words, fn (int $count) => $count >= $minCount));
-    }
 }
diff --git a/src/Module/ML/Application/Utils/WordsUtils.php b/src/Module/ML/Application/Utils/WordsUtils.php
@@ -0,0 +1,35 @@
+<?php
+
+namespace App\Module\ML\Application\Utils;
+
+use function Symfony\Component\String\u;
+
+class WordsUtils
+{
+    /**
+     * Counts the distinct whitespace-separated words that occur at least $minCount times across all samples.
+     *
+     * Only the first element of each sample is read; words are compared after snake-case normalization.
+     *
+     * @param array<string[]> $samples
+     * @param int $minCount minimum total number of occurrences a word needs in order to be counted
+     *
+     * @return int number of distinct normalized words that reach the threshold
+     */
+    public static function countUniqueWords(array $samples, int $minCount): int
+    {
+        $words = [];
+
+        foreach ($samples as $sample) {
+            // Drop only empty pieces; a truthiness check such as empty() would also discard the token "0".
+            $items = preg_split('/\s/', $sample[0], -1, PREG_SPLIT_NO_EMPTY);
+
+            foreach ($items as $item) {
+                $word = u($item)->snake()->toString();
+                $words[$word] = ($words[$word] ?? 0) + 1;
+            }
+        }
+
+        return count(array_filter($words, static fn (int $count): bool => $count >= $minCount));
+    }
+}