6 changes: 5 additions & 1 deletion .github/workflows/nameai-python-unit-tests.yml
@@ -37,7 +37,11 @@ jobs:

- name: Install dependencies
working-directory: ./apps/api.nameai.io
run: poetry install --extras "lambda"
run: poetry install --extras "lambda" --with dev

- name: Download required data files
working-directory: ./apps/api.nameai.io
run: poetry run python -m nameai.download

- name: Run tests
working-directory: ./apps/api.nameai.io
2 changes: 1 addition & 1 deletion apps/api.nameai.io/.env.example
@@ -18,4 +18,4 @@ ALCHEMY_URI_SEPOLIA=https://eth-sepolia.g.alchemy.com/v2/[YOUR_ALCHEMY_API_KEY]
# - https://discuss.ens.domains/t/ens-subgraph-migration-to-the-decentralised-version/19183
# - https://thegraph.com/explorer/subgraphs/5XqPmWe6gjyrJtFn9cLy237i4cWw2j9HcUJEXsP5qGtH?view=Query&chain=arbitrum-one
ENS_SUBGRAPH_URL_MAINNET=https://api.thegraph.com/subgraphs/name/ensdomains/ens
ENS_SUBGRAPH_URL_SEPOLIA=https://api.studio.thegraph.com/query/49574/enssepolia/version/latest
ENS_SUBGRAPH_URL_SEPOLIA=https://api.studio.thegraph.com/query/49574/enssepolia/version/latest
1 change: 1 addition & 0 deletions apps/api.nameai.io/Dockerfile
@@ -4,4 +4,5 @@ RUN yum install gcc -y
COPY pyproject.toml poetry.lock LICENSE README.md ./
COPY nameai ./nameai/
RUN pip install --no-cache-dir .[lambda]
RUN python -m nameai.download
CMD [ "nameai.root_api.handler" ]
14 changes: 10 additions & 4 deletions apps/api.nameai.io/nameai/all_tokenizer.py
@@ -69,19 +69,25 @@ def dfs(self, index, result, gap_before=False):


class AllTokenizer:
"""Return all tokenizations. It is a generator."""
"""
General-purpose tokenizer that finds all possible word combinations in text.

Uses an Aho-Corasick automaton with multiple dictionaries to identify
valid words. Can produce tokenizations with gaps.
Yields tokenizations as tuples of tokens.
"""

def __init__(self, config):
self.config = config
self.skip_non_words = config.tokenization.skip_non_words
self.with_gaps = config.tokenization.with_gaps
self.skip_non_words = config.tokenization.all_tokenizer.skip_non_words
self.with_gaps = config.tokenization.all_tokenizer.with_gaps

@static_property
def automaton(self):
automaton = ahocorasick.Automaton()

should_be_tokenized = set()
with open(get_resource_path(self.config.tokenization.should_be_tokenized), encoding='utf-8') as f:
with open(get_resource_path(self.config.tokenization.all_tokenizer.should_be_tokenized), encoding='utf-8') as f:
for line in f:
word = line.strip().lower()
should_be_tokenized.add(word)
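A quick usage sketch for the updated tokenizer, assuming the relocated config keys from prod_config.yaml below; the config loading path, the example label, and the printed output are illustrative only:

from omegaconf import OmegaConf

from nameai.all_tokenizer import AllTokenizer

# Sketch only: load the prod config directly; the running code receives it through Hydra.
config = OmegaConf.load('apps/api.nameai.io/nameai/config/prod_config.yaml')

tokenizer = AllTokenizer(config)
for tokens in tokenizer.tokenize('quantumleap'):
    # tokenizations are yielded as tuples of tokens, e.g. ('quantum', 'leap')
    print(tokens)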
20 changes: 17 additions & 3 deletions apps/api.nameai.io/nameai/config/prod_config.yaml
@@ -4,12 +4,26 @@ tokenization:
dictionary: words.txt
custom_dictionary: custom_dictionary.txt
domain_specific_dictionary: domain_specific_dictionary.txt
should_be_tokenized: should_be_tokenized.txt
skip_non_words: false
with_gaps: true
all_tokenizer:
should_be_tokenized: should_be_tokenized.txt
skip_non_words: false
with_gaps: true
person_names:
first_names: pn_firstnames.json
last_names: pn_lastnames.json
other: pn_other.json
country_stats: pn_country_stats.json
country_bonus: 100
ngrams:
unigrams: unigram_freq.csv
bigrams: bigram_freq.csv
custom_dictionary: custom_dictionary.txt
domain_specific_dictionary: domain_specific_dictionary.txt
custom_token_frequency: 500000
s3_resources:
bucket: prod-name-generator-namegeneratori-inputss3bucket-c26jqo3twfxy
person_names:
first_names_key: person_names_firstnames.json
last_names_key: person_names_lastnames.json
other_key: person_names_other.json
country_stats_key: person_names_country_stats.json
55 changes: 55 additions & 0 deletions apps/api.nameai.io/nameai/data/tests/person_names_quality.json
@@ -0,0 +1,55 @@
{
"simple_names": {
"piotrwiśniewski": ["piotr", "wiśniewski"],
"camilleclaudel": ["camille", "claudel"],
"johnnydepp": ["johnny", "depp"],
"giancarloesposito": ["giancarlo", "esposito"],
"maríagarcía": ["maría", "garcía"],
"viktororbán": ["viktor", "orbán"],
"sørenkierkegaard": ["søren", "kierkegaard"],
"oceanneguérin": ["oceanne", "guérin"]
},
"ambiguous_names": {
"dragonfernandez": {
"person_name": ["dragon", "fernandez"],
"words": ["dragon", "fern", "a", "ndez"]
},
"wolfsmith": {
"person_name": ["wolf", "smith"],
"words": ["wolf", "smith"]
},
"blacksmith": {
"person_name": null,
"words": ["black", "smith"]
},
"charleswood": {
"person_name": ["charles", "wood"],
"words": ["char", "les", "wood"]
},
"maylin": {
"person_name": ["may", "lin"],
"words": ["may", "lin"]
},
"natalieportman": {
"person_name": ["natalie", "portman"],
"words": ["nat", "alie", "port", "man"]
},
"sunyoung": {
"person_name": ["sunyoung"],
"words": ["suny", "oung"]
},
"annalísa": {
"person_name": ["anna", "lísa"],
"words": ["ann", "alísa"]
}
},
"non_names": {
"dragonfernouch": ["dragon", "fern", "ouch"],
"webmaster": ["webmaster"],
"quantumleap": ["quantum", "leap"],
"neuralnet": ["neural", "net"],
"deepmatrix": ["deep", "matrix"],
"cloudsync": ["cloud", "sync"],
"byteflow": ["byte", "flow"]
}
}
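A hedged sketch of how a quality test might read this fixture; the resource path and the assertion are assumptions, since the test module itself is not part of this diff:

import json

from nameai.data import get_resource_path

# Assumed location relative to the nameai/data package; the real tests may resolve it differently.
with open(get_resource_path('tests/person_names_quality.json'), encoding='utf-8') as f:
    cases = json.load(f)

for label, expected in cases['simple_names'].items():
    # e.g. 'johnnydepp' should tokenize to ['johnny', 'depp'] via the person-name tokenizer
    print(label, expected)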
53 changes: 53 additions & 0 deletions apps/api.nameai.io/nameai/download.py
@@ -0,0 +1,53 @@
import boto3
import botocore
from omegaconf import DictConfig
import hydra
import os

from nameai.data import get_resource_path


class S3Downloader:
def __init__(self, bucket: str):
self.s3_client = None
self.bucket = bucket
self.region_name = 'us-east-1'

def get_client(self):
if self.s3_client is None:
self.s3_client = boto3.client(
's3', region_name=self.region_name, config=botocore.config.Config(signature_version=botocore.UNSIGNED)
)
return self.s3_client

def download_file(self, key: str, local_path: str, overwrite: bool = False):
if os.path.exists(local_path) and not overwrite:
return
self.get_client().download_file(self.bucket, key, local_path)


def download_files(config: DictConfig):
"""Download files using provided config"""
downloader = S3Downloader(config.s3_resources.bucket)
files_to_download = [
(config.s3_resources.person_names.first_names_key, config.tokenization.person_names.first_names),
(config.s3_resources.person_names.last_names_key, config.tokenization.person_names.last_names),
(config.s3_resources.person_names.other_key, config.tokenization.person_names.other),
(config.s3_resources.person_names.country_stats_key, config.tokenization.person_names.country_stats),
]

for s3_key, local_path in files_to_download:
downloader.download_file(
key=s3_key,
local_path=get_resource_path(local_path),
overwrite=True,
)


@hydra.main(config_path='./config', config_name='prod_config', version_base=None)
def download_files_main(config: DictConfig):
download_files(config)


if __name__ == '__main__':
download_files_main()
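The module is run as python -m nameai.download in the Dockerfile and CI workflow above. A minimal sketch of fetching a single resource directly, with the bucket name, S3 key, and local filename taken from prod_config.yaml; unsigned access means no AWS credentials are required:

from nameai.data import get_resource_path
from nameai.download import S3Downloader

# Sketch: one file from the public bucket; download_files() above loops over all four.
downloader = S3Downloader('prod-name-generator-namegeneratori-inputss3bucket-c26jqo3twfxy')
downloader.download_file(
    key='person_names_firstnames.json',
    local_path=get_resource_path('pn_firstnames.json'),
    overwrite=False,  # unlike download_files(), skip the fetch if the file already exists locally
)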
4 changes: 2 additions & 2 deletions apps/api.nameai.io/nameai/ngrams.py
@@ -82,13 +82,13 @@ def all_unigrams_count(self) -> int:
def all_bigrams_count(self) -> int:
return self._bigrams_and_count[1]

def unigram_count(self, word: str) -> int:
def unigram_count(self, word: str) -> int | float:
return self.unigrams.get(word, self.oov_count(word))

def bigram_count(self, word: str) -> Optional[int]:
return self.bigrams.get(word, None)

def oov_count(self, word: str) -> int:
def oov_count(self, word: str) -> float:
return (1 / 100) ** (len(word))

def word_probability(self, word: str) -> float:
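A worked example of the out-of-vocabulary pseudo-count above, which is why unigram_count() can now return a float; the word is made up:

word = 'zxqv'
oov = (1 / 100) ** len(word)  # 1e-08 for a 4-character unknown word
# known words still return their integer counts loaded from unigram_freq.csv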
42 changes: 34 additions & 8 deletions apps/api.nameai.io/nameai/nlp_inspector.py
@@ -10,6 +10,7 @@
)
from nameai.all_tokenizer import AllTokenizer
from nameai.ngrams import Ngrams
from nameai.person_names import PersonNameTokenizer


def init_inspector():
@@ -49,6 +50,7 @@ class NLPInspector:
def __init__(self, config):
self.inspector = init_inspector()
self.tokenizer = AllTokenizer(config)
self.person_names_tokenizer = PersonNameTokenizer(config)
self.ngrams = Ngrams(config)

def nlp_analyse_label(self, label: str) -> NLPLabelAnalysis:
@@ -92,28 +94,52 @@ def base_analyse_label(self, label: str):
return self.inspector.analyse_label(label, simple_confusables=True)

def tokenize(self, label: str, tokenizations_limit: int) -> tuple[list[dict], bool]:
tokenizeds_iterator = self.tokenizer.tokenize(label)
"""
Tokenize text using both the person-name and the general-purpose tokenizers.

Combines results from PersonNameTokenizer (with name-specific probabilities)
and AllTokenizer (with ngram-based probabilities).
Returns tokenizations sorted by probability.
"""
all_tokenizer_iterator = self.tokenizer.tokenize(label)
person_names_iterator = self.person_names_tokenizer.tokenize_with_scores(label)

tokenizeds = []
partial_tokenization = False
try:
used = set()
i = 0
for tokenized in tokenizeds_iterator:

# first add person name tokenizations with their original scores
for tokenized, log_prob in person_names_iterator:
if tokenized not in used:
if i == tokenizations_limit:
partial_tokenization = True
break
used.add(tokenized)
i += 1
tokenizeds.append({'tokens': tokenized, 'log_probability': log_prob, 'source': 'person_names'})

# then add regular tokenizations
for tokenized in all_tokenizer_iterator:
if tokenized not in used:
if i == tokenizations_limit:
partial_tokenization = True
break
used.add(tokenized)
i += 1
tokenizeds.append(tokenized)
# for AllTokenizer tokenizations, use ngrams probability
tokenizeds.append(
{
'tokens': tokenized,
'log_probability': self.ngrams.sequence_log_probability(tokenized),
'source': 'ngrams',
}
)

except RecursionError:
partial_tokenization = True

tokenizeds = [
{'tokens': tokenized, 'log_probability': self.ngrams.sequence_log_probability(tokenized)}
for tokenized in tokenizeds
]

for tokenized in tokenizeds:
tokenized['tokens'] = tuple(uniq_gaps(tokenized['tokens']))
tokenized['probability'] = math.exp(tokenized['log_probability'])
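An illustrative view of what tokenize() now accumulates: person-name candidates keep their own scores and are emitted first, AllTokenizer candidates are scored with ngrams, and the final loop above adds an exponentiated probability to each entry. The label and the numbers here are made up:

tokenizeds = [
    {'tokens': ('johnny', 'depp'), 'log_probability': -8.1, 'source': 'person_names'},
    {'tokens': ('john', 'ny', 'depp'), 'log_probability': -21.7, 'source': 'ngrams'},
]
# each entry later gains: tokenized['probability'] = math.exp(tokenized['log_probability'])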