6 changes: 5 additions & 1 deletion .github/workflows/nameai-python-unit-tests.yml
@@ -37,7 +37,11 @@ jobs:

- name: Install dependencies
working-directory: ./apps/api.nameai.io
run: poetry install --extras "lambda"
run: poetry install --extras "lambda" --with dev

- name: Download required data files
working-directory: ./apps/api.nameai.io
run: poetry run python -m nameai.download

- name: Run tests
working-directory: ./apps/api.nameai.io
2 changes: 1 addition & 1 deletion apps/api.nameai.io/.env.example
@@ -18,4 +18,4 @@ ALCHEMY_URI_SEPOLIA=https://eth-sepolia.g.alchemy.com/v2/[YOUR_ALCHEMY_API_KEY]
# - https://discuss.ens.domains/t/ens-subgraph-migration-to-the-decentralised-version/19183
# - https://thegraph.com/explorer/subgraphs/5XqPmWe6gjyrJtFn9cLy237i4cWw2j9HcUJEXsP5qGtH?view=Query&chain=arbitrum-one
ENS_SUBGRAPH_URL_MAINNET=https://api.thegraph.com/subgraphs/name/ensdomains/ens
ENS_SUBGRAPH_URL_SEPOLIA=https://api.studio.thegraph.com/query/49574/enssepolia/version/latest
ENS_SUBGRAPH_URL_SEPOLIA=https://api.studio.thegraph.com/query/49574/enssepolia/version/latest
1 change: 1 addition & 0 deletions apps/api.nameai.io/Dockerfile
@@ -4,4 +4,5 @@ RUN yum install gcc -y
COPY pyproject.toml poetry.lock LICENSE README.md ./
COPY nameai ./nameai/
RUN pip install --no-cache-dir .[lambda]
RUN python -m nameai.download
CMD [ "nameai.root_api.handler" ]
14 changes: 10 additions & 4 deletions apps/api.nameai.io/nameai/all_tokenizer.py
@@ -69,19 +69,25 @@ def dfs(self, index, result, gap_before=False):


class AllTokenizer:
"""Return all tokenizations. It is a generator."""
"""
General-purpose tokenizer that finds all possible word combinations in text.

Uses an Aho-Corasick automaton with multiple dictionaries to identify
valid words. Can produce tokenizations with gaps.
Yields tokenizations as tuples of tokens.
"""

def __init__(self, config):
self.config = config
self.skip_non_words = config.tokenization.skip_non_words
self.with_gaps = config.tokenization.with_gaps
self.skip_non_words = config.tokenization.all_tokenizer.skip_non_words
self.with_gaps = config.tokenization.all_tokenizer.with_gaps

@static_property
def automaton(self):
automaton = ahocorasick.Automaton()

should_be_tokenized = set()
with open(get_resource_path(self.config.tokenization.should_be_tokenized), encoding='utf-8') as f:
with open(get_resource_path(self.config.tokenization.all_tokenizer.should_be_tokenized), encoding='utf-8') as f:
for line in f:
word = line.strip().lower()
should_be_tokenized.add(word)
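A quick usage sketch for the updated tokenizer, assuming the relocated config keys from prod_config.yaml below; the config loading path, the example label, and the printed output are illustrative only:

from omegaconf import OmegaConf

from nameai.all_tokenizer import AllTokenizer

# Sketch only: load the prod config directly; the running code receives it through Hydra.
config = OmegaConf.load('apps/api.nameai.io/nameai/config/prod_config.yaml')

tokenizer = AllTokenizer(config)
for tokens in tokenizer.tokenize('quantumleap'):
    # tokenizations are yielded as tuples of tokens, e.g. ('quantum', 'leap')
    print(tokens)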
20 changes: 17 additions & 3 deletions apps/api.nameai.io/nameai/config/prod_config.yaml
@@ -4,12 +4,26 @@ tokenization:
dictionary: words.txt
custom_dictionary: custom_dictionary.txt
domain_specific_dictionary: domain_specific_dictionary.txt
should_be_tokenized: should_be_tokenized.txt
skip_non_words: false
with_gaps: true
all_tokenizer:
should_be_tokenized: should_be_tokenized.txt
skip_non_words: false
with_gaps: true
person_names:
first_names: pn_firstnames.json
last_names: pn_lastnames.json
other: pn_other.json
country_stats: pn_country_stats.json
country_bonus: 100
ngrams:
unigrams: unigram_freq.csv
bigrams: bigram_freq.csv
custom_dictionary: custom_dictionary.txt
domain_specific_dictionary: domain_specific_dictionary.txt
custom_token_frequency: 500000
s3_resources:
bucket: prod-name-generator-namegeneratori-inputss3bucket-c26jqo3twfxy
person_names:
first_names_key: person_names_firstnames.json
last_names_key: person_names_lastnames.json
other_key: person_names_other.json
country_stats_key: person_names_country_stats.json
55 changes: 55 additions & 0 deletions apps/api.nameai.io/nameai/data/tests/person_names_quality.json
@@ -0,0 +1,55 @@
{
"simple_names": {
"piotrwiśniewski": ["piotr", "wiśniewski"],
"camilleclaudel": ["camille", "claudel"],
"johnnydepp": ["johnny", "depp"],
"giancarloesposito": ["giancarlo", "esposito"],
"maríagarcía": ["maría", "garcía"],
"viktororbán": ["viktor", "orbán"],
"sørenkierkegaard": ["søren", "kierkegaard"],
"oceanneguérin": ["oceanne", "guérin"]
},
"ambiguous_names": {
"dragonfernandez": {
"person_name": ["dragon", "fernandez"],
"words": ["dragon", "fern", "a", "ndez"]
},
"wolfsmith": {
"person_name": ["wolf", "smith"],
"words": ["wolf", "smith"]
},
"blacksmith": {
"person_name": null,
"words": ["black", "smith"]
},
"charleswood": {
"person_name": ["charles", "wood"],
"words": ["char", "les", "wood"]
},
"maylin": {
"person_name": ["may", "lin"],
"words": ["may", "lin"]
},
"natalieportman": {
"person_name": ["natalie", "portman"],
"words": ["nat", "alie", "port", "man"]
},
"sunyoung": {
"person_name": ["sunyoung"],
"words": ["suny", "oung"]
},
"annalísa": {
"person_name": ["anna", "lísa"],
"words": ["ann", "alísa"]
}
},
"non_names": {
"dragonfernouch": ["dragon", "fern", "ouch"],
"webmaster": ["webmaster"],
"quantumleap": ["quantum", "leap"],
"neuralnet": ["neural", "net"],
"deepmatrix": ["deep", "matrix"],
"cloudsync": ["cloud", "sync"],
"byteflow": ["byte", "flow"]
}
}
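A hedged sketch of how a quality test might read this fixture; the resource path and the assertion are assumptions, since the test module itself is not part of this diff:

import json

from nameai.data import get_resource_path

# Assumed location relative to the nameai/data package; the real tests may resolve it differently.
with open(get_resource_path('tests/person_names_quality.json'), encoding='utf-8') as f:
    cases = json.load(f)

for label, expected in cases['simple_names'].items():
    # e.g. 'johnnydepp' should tokenize to ['johnny', 'depp'] via the person-name tokenizer
    print(label, expected)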
53 changes: 53 additions & 0 deletions apps/api.nameai.io/nameai/download.py
@@ -0,0 +1,53 @@
import boto3
import botocore
from omegaconf import DictConfig
import hydra
import os

from nameai.data import get_resource_path


class S3Downloader:
def __init__(self, bucket: str):
self.s3_client = None
self.bucket = bucket
self.region_name = 'us-east-1'

def get_client(self):
if self.s3_client is None:
self.s3_client = boto3.client(
's3', region_name=self.region_name, config=botocore.config.Config(signature_version=botocore.UNSIGNED)
)
return self.s3_client

def download_file(self, key: str, local_path: str, overwrite: bool = False):
if os.path.exists(local_path) and not overwrite:
return
self.get_client().download_file(self.bucket, key, local_path)


def download_files(config: DictConfig):
"""Download files using provided config"""
downloader = S3Downloader(config.s3_resources.bucket)
files_to_download = [
(config.s3_resources.person_names.first_names_key, config.tokenization.person_names.first_names),
(config.s3_resources.person_names.last_names_key, config.tokenization.person_names.last_names),
(config.s3_resources.person_names.other_key, config.tokenization.person_names.other),
(config.s3_resources.person_names.country_stats_key, config.tokenization.person_names.country_stats),
]

for s3_key, local_path in files_to_download:
downloader.download_file(
key=s3_key,
local_path=get_resource_path(local_path),
overwrite=True,
)


@hydra.main(config_path='./config', config_name='prod_config', version_base=None)
def download_files_main(config: DictConfig):
download_files(config)


if __name__ == '__main__':
download_files_main()
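The module is run as python -m nameai.download in the Dockerfile and CI workflow above. A minimal sketch of fetching a single resource directly, with the bucket name, S3 key, and local filename taken from prod_config.yaml; unsigned access means no AWS credentials are required:

from nameai.data import get_resource_path
from nameai.download import S3Downloader

# Sketch: one file from the public bucket; download_files() above loops over all four.
downloader = S3Downloader('prod-name-generator-namegeneratori-inputss3bucket-c26jqo3twfxy')
downloader.download_file(
    key='person_names_firstnames.json',
    local_path=get_resource_path('pn_firstnames.json'),
    overwrite=False,  # unlike download_files(), skip the fetch if the file already exists locally
)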
4 changes: 2 additions & 2 deletions apps/api.nameai.io/nameai/ngrams.py
@@ -82,13 +82,13 @@ def all_unigrams_count(self) -> int:
def all_bigrams_count(self) -> int:
return self._bigrams_and_count[1]

def unigram_count(self, word: str) -> int:
def unigram_count(self, word: str) -> int | float:
return self.unigrams.get(word, self.oov_count(word))

def bigram_count(self, word: str) -> Optional[int]:
return self.bigrams.get(word, None)

def oov_count(self, word: str) -> int:
def oov_count(self, word: str) -> float:
return (1 / 100) ** (len(word))

def word_probability(self, word: str) -> float:
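A worked example of the out-of-vocabulary pseudo-count above, which is why unigram_count() can now return a float; the word is made up:

word = 'zxqv'
oov = (1 / 100) ** len(word)  # 1e-08 for a 4-character unknown word
# known words still return their integer counts loaded from unigram_freq.csv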
42 changes: 34 additions & 8 deletions apps/api.nameai.io/nameai/nlp_inspector.py
@@ -10,6 +10,7 @@
)
from nameai.all_tokenizer import AllTokenizer
from nameai.ngrams import Ngrams
from nameai.person_names import PersonNameTokenizer


def init_inspector():
@@ -49,6 +50,7 @@ class NLPInspector:
def __init__(self, config):
self.inspector = init_inspector()
self.tokenizer = AllTokenizer(config)
self.person_names_tokenizer = PersonNameTokenizer(config)
self.ngrams = Ngrams(config)

def nlp_analyse_label(self, label: str) -> NLPLabelAnalysis:
@@ -92,28 +94,52 @@ def base_analyse_label(self, label: str):
return self.inspector.analyse_label(label, simple_confusables=True)

def tokenize(self, label: str, tokenizations_limit: int) -> tuple[list[dict], bool]:
tokenizeds_iterator = self.tokenizer.tokenize(label)
"""
Tokenize text using both the person-name and the general-purpose tokenizers.

Combines results from PersonNameTokenizer (with name-specific probabilities)
and AllTokenizer (with ngram-based probabilities).
Returns tokenizations sorted by probability.
"""
all_tokenizer_iterator = self.tokenizer.tokenize(label)
person_names_iterator = self.person_names_tokenizer.tokenize_with_scores(label)

tokenizeds = []
partial_tokenization = False
try:
used = set()
i = 0
for tokenized in tokenizeds_iterator:

# first add person name tokenizations with their original scores
for tokenized, log_prob in person_names_iterator:
if tokenized not in used:
if i == tokenizations_limit:
partial_tokenization = True
break
used.add(tokenized)
i += 1
tokenizeds.append({'tokens': tokenized, 'log_probability': log_prob, 'source': 'person_names'})

# then add regular tokenizations
for tokenized in all_tokenizer_iterator:
if tokenized not in used:
if i == tokenizations_limit:
partial_tokenization = True
break
used.add(tokenized)
i += 1
tokenizeds.append(tokenized)
# for AllTokenizer tokenizations, use ngrams probability
tokenizeds.append(
{
'tokens': tokenized,
'log_probability': self.ngrams.sequence_log_probability(tokenized),
'source': 'ngrams',
}
)

except RecursionError:
partial_tokenization = True

tokenizeds = [
{'tokens': tokenized, 'log_probability': self.ngrams.sequence_log_probability(tokenized)}
for tokenized in tokenizeds
]

for tokenized in tokenizeds:
tokenized['tokens'] = tuple(uniq_gaps(tokenized['tokens']))
tokenized['probability'] = math.exp(tokenized['log_probability'])
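An illustrative view of what tokenize() now accumulates: person-name candidates keep their own scores and are emitted first, AllTokenizer candidates are scored with ngrams, and the final loop above adds an exponentiated probability to each entry. The label and the numbers here are made up:

tokenizeds = [
    {'tokens': ('johnny', 'depp'), 'log_probability': -8.1, 'source': 'person_names'},
    {'tokens': ('john', 'ny', 'depp'), 'log_probability': -21.7, 'source': 'ngrams'},
]
# each entry later gains: tokenized['probability'] = math.exp(tokenized['log_probability'])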