From 91a684257315870e6433fffecdec563748dc6dcc Mon Sep 17 00:00:00 2001 From: Mykola Haltiuk Date: Thu, 20 Feb 2025 06:16:57 +0100 Subject: [PATCH 1/3] reworked namegraph readme --- docs/media/domain-detail.dark.svg | 2 + docs/media/domain-detail.light.svg | 2 + docs/media/full.dark.svg | 31 +++ docs/media/full.light.svg | 31 +++ docs/media/generators.dark.svg | 4 + docs/media/generators.light.svg | 4 + docs/media/instant.dark.svg | 2 + docs/media/instant.light.svg | 2 + docs/media/logo-light.svg | 13 + docs/old_readme.md | 202 ++++++++++++++ readme.md | 427 ++++++++++++++++++----------- 11 files changed, 562 insertions(+), 158 deletions(-) create mode 100644 docs/media/domain-detail.dark.svg create mode 100644 docs/media/domain-detail.light.svg create mode 100644 docs/media/full.dark.svg create mode 100644 docs/media/full.light.svg create mode 100644 docs/media/generators.dark.svg create mode 100644 docs/media/generators.light.svg create mode 100644 docs/media/instant.dark.svg create mode 100644 docs/media/instant.light.svg create mode 100644 docs/media/logo-light.svg create mode 100644 docs/old_readme.md diff --git a/docs/media/domain-detail.dark.svg b/docs/media/domain-detail.dark.svg new file mode 100644 index 00000000..80301a14 --- /dev/null +++ b/docs/media/domain-detail.dark.svg @@ -0,0 +1,2 @@ + + diff --git a/docs/media/domain-detail.light.svg b/docs/media/domain-detail.light.svg new file mode 100644 index 00000000..4130271f --- /dev/null +++ b/docs/media/domain-detail.light.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/docs/media/full.dark.svg b/docs/media/full.dark.svg new file mode 100644 index 00000000..950f7cae --- /dev/null +++ b/docs/media/full.dark.svg @@ -0,0 +1,31 @@ + + + + + + + + + + diff --git a/docs/media/full.light.svg b/docs/media/full.light.svg new file mode 100644 index 00000000..5ba534aa --- /dev/null +++ b/docs/media/full.light.svg @@ -0,0 +1,31 @@ + + + + + + + + + + \ No newline at end of file diff --git a/docs/media/generators.dark.svg b/docs/media/generators.dark.svg new file mode 100644 index 00000000..077f7772 --- /dev/null +++ b/docs/media/generators.dark.svg @@ -0,0 +1,4 @@ + + + +

[generators.dark.svg label text: categories Wordplay, Alternates, Emojify, Expand, Community, Go Wild, Other; generators Suffix, PersonNameExpand, Prefix, Rhymes, Emoji, Keycap, FlagAffix, Symbol, PersonNameEmojify, Permute, Hyphen, Abbreviation, Wikipedia2V, W2V, SubstringMatch, RandomAvailableName, OnSaleMatch, Categories, Leet, Reverse, EasterEgg, SpecialCharacterAffix, WordnetSynonyms]
\ No newline at end of file diff --git a/docs/media/generators.light.svg b/docs/media/generators.light.svg new file mode 100644 index 00000000..023566f2 --- /dev/null +++ b/docs/media/generators.light.svg @@ -0,0 +1,4 @@ + + + +

[generators.light.svg label text: identical category and generator labels as generators.dark.svg above]
\ No newline at end of file diff --git a/docs/media/instant.dark.svg b/docs/media/instant.dark.svg new file mode 100644 index 00000000..65841eac --- /dev/null +++ b/docs/media/instant.dark.svg @@ -0,0 +1,2 @@ + + diff --git a/docs/media/instant.light.svg b/docs/media/instant.light.svg new file mode 100644 index 00000000..016c202c --- /dev/null +++ b/docs/media/instant.light.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/docs/media/logo-light.svg b/docs/media/logo-light.svg new file mode 100644 index 00000000..36dee215 --- /dev/null +++ b/docs/media/logo-light.svg @@ -0,0 +1,13 @@ + + + + NameGraph + + + + + + beta + + + diff --git a/docs/old_readme.md b/docs/old_readme.md new file mode 100644 index 00000000..a6066f4c --- /dev/null +++ b/docs/old_readme.md @@ -0,0 +1,202 @@ +![Tests](https://github.com/namehash/namegraph/actions/workflows/ci.yml/badge.svg?branch=master) + +# Install + +``` +pip3 install -e . +``` + +# Usage + +The application can be run: +* reading queries from stdin +* reading query as an argument + +Additional resources need to be downloaded: +``` +python download.py # dictionaries, embeddings +python download_names.py +``` + +## Queries from stdin + +``` +python ./generator/app.py app.input=stdin +``` + +## Query as an argument + +The application can be run with + +``` +python ./generator/app.py +``` + +It will generate suggestions for the default query. + +The default parameters are defined in `conf/config.yaml`. Any of the parameters might be substituted with a path to the +parameter, with dot-separated fragments, e.g. + +``` +python ./generator/app.py app.query=firepower +``` + +will substitute the default query with the provided one. + +The parameters are documented in the config. + +# REST API + +Start server: +``` +python -m uvicorn web_api:app --reload +``` + +Query with POST: +``` +curl -d '{"label":"fire"}' -H "Content-Type: application/json" -X POST http://localhost:8000 +``` + +# Tests + +Run: +``` +pytest +``` +or without slow tests: +``` +pytest -m "not slow" +``` + +## Debugging + +Run app with `app.logging_level=DEBUG` to see debug information: +``` +python generator/app.py app.input=stdin app.logging_level=DEBUG +``` + +# Deployment + +## Build Docker image locally + +Set image TAG: + +`export TAG=0.1.0` + +Build a Docker image locally + +`docker compose -f docker-compose.build.yml build` + +Authorize to Amazon (if you are using MFA you have to take temporary ACCESS keys from AWS STS): + +`aws configure` + +Authorize to ECR: + +`./authorize-ecr.sh` + +Push image to ECR: + +`docker push 571094861812.dkr.ecr.us-east-1.amazonaws.com/name-generator:${TAG} + +## Deploy image on remote instance + +Set image TAG: + +`export TAG=0.1.0 + +Authorize EC2 instance in ECR: + +`aws ecr get-login-password | docker login --username AWS --password-stdin 571094861812.dkr.ecr.us-east-1.amazonaws.com/name-generator` + +(Re-Deploy) image: + +`docker compose up -d` + +Check if service works: + +`curl -d '{"label":"firestarter"}' -H "Content-Type: application/json" -X POST http://44.203.61.202` + +## Learning-To-Rank + +To access the LTR features, you need to configure it in the Elasticsearch instance (see [here](https://github.com/namehash/collection-templates/tree/master/research/learning-to-rank/readme.md) for more details). + +## Pipelines, weights, sampler + +In `conf/prod_config_new.yaml` are defined `generator_limits` which limits maximum number of suggestions generated by each generator. This is for optimization. 
E.g.:
```yaml
  generator_limits:
    HyphenGenerator: 128
    AbbreviationGenerator: 128
    EmojiGenerator: 150
    Wikipedia2VGenerator: 100
    RandomAvailableNameGenerator: 20000
```

Pipelines are defined in `conf/pipelines/prod_new.yaml`. Each pipeline has:
* a `name`
* one `generator`
* a list of `filters`, e.g. SubnameFilter, ValidNameFilter, ValidNameLengthFilter, DomainFilter
* `weights` for each interpretation type (`ngram`, `person`, `other`) and each language
* `mode_weights_multiplier` - a multiplier of the above weights for each mode (e.g. `instant`, `domain_detail`, `full`)
* `global_limits` for each mode, which can be an integer (absolute number) or a float (percentage of all results); you can also override values for the `grouped_by_category` endpoint by adding the prefix `grouped_` (e.g. `grouped_instant`, `grouped_domain_detail`, `grouped_full`)

Setting `0` in `mode_weights_multiplier` or `global_limits` disables the pipeline in a given mode.

### Sampler

Each request defines:
* `mode`
* `min_suggestions`
* `max_suggestions`
* `min_available_fraction`

A name can have many interpretations. Every interpretation has a type (`ngram`, `person`, `other`) and a language. Every interpretation has a probability. There might be more than one interpretation with the same type and language.

For each pair of type and language, probabilities of each pipeline are computed.

1. If there are enough suggestions, then break.
2. If all pipeline probabilities for every pair of type and language are 0, then break.
3. Sample a type and language, then sample an interpretation within this type and language.
4. Sample a pipeline for the sampled interpretation. The first pass of sampling is without replacement to increase diversity in top suggestions.
5. If the pipeline exceeds its global limit, then go to 4.
6. Get a suggestion from the pipeline. (The generator is executed here.) If there are no more suggestions, then go to 4.
7. If the suggestion has already been sampled, then go to 6.
8. If the suggestion is not available and there is room only for available ones, then go to 6.
9. If the suggestion is not normalized, then go to 6.
10. Go to 1.

Exhausted pipelines are removed from sampling.

### Grouped by category

Parameters:
* `mode`
* `min_available_fraction`
* max number of categories
* max number of suggestions per category
* max related categories
* min total categories?
* max total categories?

Requirements:
* order of categories is fixed
* every generator must be mapped to only one category
* flag generator suggestions should appear in 10% of suggestions - maybe we should detect if it is the user's first search
  * should we remove the first pass of sampling with every generator?

1. Shuffle the order of categories (using weights?) if the min number of categories is smaller than the number of all categories. If some category does not return suggestions, then we take the next one.
2. Within each category: sample a type and language of interpretation, then sample an interpretation with this type and language. Sample a pipeline (weights of pipelines depend on type and language). Do it in parallel?
3. Sample `max number of suggestions per category`. How to handle `min_available_fraction`?

### Suggestions by category

For each category a MetaSampler is created with appropriate pipelines.
In parallel, all MetaSamplers are executed. In one MetaSampler:
1. Apply global limits.
2. For each interpretation (interpretation_type, lang, tokenization) a sampler is created.

A toy sketch of the core sampling loop follows below.
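For illustration, here is a minimal, runnable sketch of the weighted sampling loop described in the Sampler section above. Everything in it is a hypothetical stand-in (toy pipelines, made-up weights), not the actual sampler implementation:

```python
import random

# Toy stand-ins: each "pipeline" yields suggestions until it is exhausted.
pipelines = {
    "hyphen": iter(["fire-power", "fire-starter"]),
    "emoji": iter(["fire🔥", "🔥power"]),
    "w2v": iter(["blaze", "flame", "inferno"]),
}
# Hypothetical pipeline weights for one (type, language) pair.
weights = {"hyphen": 0.2, "emoji": 0.3, "w2v": 0.5}

max_suggestions = 4
seen, results = set(), []

while len(results) < max_suggestions and weights:
    # Steps 3-4: sample a pipeline proportionally to its weight.
    names = list(weights)
    name = random.choices(names, weights=[weights[n] for n in names])[0]
    suggestion = next(pipelines[name], None)
    if suggestion is None:
        # Step 6: the pipeline is exhausted, remove it from sampling.
        del weights[name]
        continue
    if suggestion in seen:
        # Step 7 (simplified): skip duplicates and sample again.
        continue
    seen.add(suggestion)
    results.append(suggestion)

print(results)
```

The real sampler additionally tracks global limits, availability and normalization checks, and a first sampling pass without replacement.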
+ + +After generation of suggestions for all categories: +1. For each category number of suggestions is limited by category's `max_suggestions`. +2. If `count_real_suggestions` < `min_total_suggestions` then RandomAvailable names are appended as `other` category. diff --git a/readme.md b/readme.md index a6066f4c..2bd831c0 100644 --- a/readme.md +++ b/readme.md @@ -1,202 +1,313 @@ -![Tests](https://github.com/namehash/namegraph/actions/workflows/ci.yml/badge.svg?branch=master) -# Install + -``` -pip3 install -e . -``` +
+
-# Usage + -The application can be run: -* reading queries from stdin -* reading query as an argument +

![NameGraph](docs/media/logo-light.svg)

-Additional resources need to be downloaded: -``` -python download.py # dictionaries, embeddings -python download_names.py -``` + +

+ Surf more than 21 million name ideas across more than 400,000 name collections,
or generate infinite related name suggestions. +

-## Queries from stdin - -``` -python ./generator/app.py app.input=stdin + +

[badges: Python Build Status · Python CI Status · status: beta]

+ +# Project Status + +NameGraph is currently in beta. We are excited to share our work with you and continue to build the greatest web of names in history! + +# Overview + +NameGraph is a web service that generates name suggestions for a given input label. It is implemented using FastAPI and provides a variety of endpoints to generate suggestions in different modes and with different parameters. + +## Label Analysis + +The input label is analyzed to determine the most relevant name suggestions. The analysis includes: + +- Defining all possible interpretations of the input label along with their probabilities (whether it is a sequence of common words, a person name, what is the language, etc.) +- For each interpretation, determining most probable tokenizations (e.g. `armstrong` -> `["armstrong"]`, `armstrong` -> `["arm", "strong"]`) + +The suggestions are later generated based on these interpretations, tokenizations being especially important, since many generators greatly rely on them. This is why the endpoints can handle pretokenized input. + +## Collections + +Collections are curated sets of names that serve as a core component of NameGraph's name suggestion system. The system maintains a vast database of over 400,000 name collections containing more than 21 million unique names. Each collection is stored in Elasticsearch and contains: + +- A unique collection ID +- Collection title and description +- Collection rank and metadata +- Member names with their normalized and tokenized forms +- Collection types and categories +- Related collections + +Collections are used in several key ways: + +1. Direct Name Generation: + - Searches collections based on input tokens + - Uses [learning-to-rank models](#learning-to-rank) to find relevant collections + +2. Related Collections: + - Finds collections with similar themes and content + - Ensures diverse suggestions across different categories + +3. Membership Lookup: + - Discovers collections containing specific names + - Enables finding thematically related names + +The collections are maintained and updated through our [NameGraph Collections](https://github.com/namehash/namegraph-collections) project, ensuring the suggestion database stays current and comprehensive. + +## Generators + +Generators are core components that create name suggestions through different methods. Each generator inherits from the base [NameGenerator](namegraph/generation/name_generator.py) class and implements specific name generation strategies. They can be grouped into the categories as shown in the diagram below: + +

![Generator categories diagram](docs/media/generators.light.svg)
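To make the generator interface concrete, here is a toy example in the spirit of the Suffix generator from the diagram. The class name and `generate` signature are assumptions for illustration only; the actual base class and interface live in [NameGenerator](namegraph/generation/name_generator.py):

```python
from typing import Iterable


class ToySuffixGenerator:
    """Illustrative only: appends a few suffixes to a tokenized label."""

    SUFFIXES = ("dao", "labs", "verse")

    def generate(self, tokens: list[str]) -> Iterable[str]:
        base = "".join(tokens)  # generators receive the tokenized label
        for suffix in self.SUFFIXES:
            yield base + suffix


print(list(ToySuffixGenerator().generate(["fire"])))
# ['firedao', 'firelabs', 'fireverse']
```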

+ +## Modes + +NameGraph supports three modes for processing requests: + +- Instant Mode (`instant`): + - Fastest response time + - More basic name generations + - Some advanced generators like W2VGenerator are disabled (weight multiplier = 0) + - Often used for real-time suggestions + +- Domain Detail Mode (`domain_detail`): + - Intermediate between instant and full + - More comprehensive than instant, but still optimized for performance + - Some generators have reduced weights compared to full mode + - Expanded search window for collection ranking and sampling + +- Full Mode (`full`): + - Most comprehensive name generation + - Includes all enabled generators + - Uses full weights for most generators + - Accesses advanced generators like `Wikipedia2VGenerator` and `W2VGenerator` + - Takes longer to process, but provides the most diverse results + + +Different generators are enabled/disabled for each mode. Take a look at the [generators diagram](#generators) to see which generators are available in each mode. + +
| Icon | Mode | Description |
|------|------|-------------|
| ![Instant](docs/media/instant.light.svg) | Instant | Fastest response, basic generators only |
| ![Domain Detail](docs/media/domain-detail.light.svg) | Domain Detail | Balanced speed/quality, expanded search |
| ![Full](docs/media/full.light.svg) | Full | Comprehensive generation with all generators |
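As a sketch, a request that selects a mode explicitly might look like the call below. The field names mirror the request parameters described in the Sampler section that follows, but they are assumptions here; take the exact schema from the auto-generated docs at `/docs`:

```bash
curl -d '{"label":"fire","mode":"full","min_suggestions":20,"max_suggestions":50,"min_available_fraction":0.1}' \
     -H "Content-Type: application/json" -X POST http://localhost:8000
```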
+ +## Sampler + +The sampler is a sophisticated component that manages the selection and generation of name suggestions. It implements a probabilistic sampling algorithm that balances diversity, relevance, and efficiency while respecting various constraints. + +### Key Components + +- **Request Parameters**: + - `mode`: Determines which generators are active (`instant`/`domain_detail`/`full`) + - `min_suggestions`: Minimum number of suggestions to return + - `max_suggestions`: Maximum number of suggestions to return + - `min_available_fraction`: Minimum fraction of suggestions that must be available + +- **Interpretations**: Each input name can have multiple interpretations, characterized by: + - Type (`ngram`, `person`, `other`) + - Language + - Probability score + - Possible tokenizations + +### Sampling Algorithm + +The sampler uses a probabilistic approach to generate diverse and relevant name suggestions: + +```mermaid +flowchart TD + A[Start] --> B{Enough suggestions?} + B -->|Yes| Z[End] + B -->|No| C{All probabilities = 0?} + C -->|Yes| Z + C -->|No| D[Sample type & language] + D --> E["Sample tokenization"] + E --> F[Sample pipeline] + F --> G{Pipeline exceeds limit?} + G -->|Yes| F + G -->|No| H[Get suggestion from pipeline] + H --> I{Any suggestions left?} + I -->|Yes| J{Already sampled?} + I -->|No| F + J -->|Yes| H + J -->|No| K{Available if required?} + K -->|No| H + K -->|Yes| L{Normalized?} + L -->|No| H + L -->|Yes| B ``` -## Query as an argument +The algorithm works as follows: -The application can be run with - -``` -python ./generator/app.py -``` +1. **Initialization**: For each type-language pair, pipeline probabilities are computed. -It will generate suggestions for the default query. +2. **Main Loop**: The sampler iterates until either: + - Enough suggestions are generated (`max_suggestions` met) + - All pipeline probabilities become zero -The default parameters are defined in `conf/config.yaml`. Any of the parameters might be substituted with a path to the -parameter, with dot-separated fragments, e.g. +3. **Sampling Process**: + - First samples a type and language pair + - Then samples a specific tokenization within that pair + - Selects a pipeline using probability-based sampling + - First pass uses sampling without replacement for diversity + +4. **Validation Checks**: + - Verifies pipeline hasn't exceeded its global limit + - Ensures suggestions aren't duplicates + - Checks availability status if required + - Confirms normalization status -``` -python ./generator/app.py app.query=firepower -``` +5. **Pipeline Management**: + - Exhausted pipelines are removed from the sampling pool + - When a pipeline can't generate more suggestions, falls back to other pipelines -will substitute the default query with the provided one. +This approach ensures a balanced mix of suggestions while maintaining efficiency and respecting all configured constraints. -The parameters are documented in the config. +# Usage -# REST API +NameGraph uses [Poetry](https://python-poetry.org/) for dependency management and packaging. Before getting started, make sure you have Poetry installed on your system. 
-Start server: -``` -python -m uvicorn web_api:app --reload -``` +## Prerequisites -Query with POST: +Install Poetry if you haven't already: +```bash +curl -sSL https://install.python-poetry.org | python3 - ``` -curl -d '{"label":"fire"}' -H "Content-Type: application/json" -X POST http://localhost:8000 -``` - -# Tests -Run: -``` -pytest -``` -or without slow tests: -``` -pytest -m "not slow" -``` +Visit [Poetry installation guide](https://python-poetry.org/docs/#installation) for more details. -## Debugging +## Install -Run app with `app.logging_level=DEBUG` to see debug information: -``` -python generator/app.py app.input=stdin app.logging_level=DEBUG +Clone the repository and install dependencies: +```bash +git clone https://github.com/namehash/namegraph.git +cd namegraph +poetry install ``` -# Deployment +## Download resources -## Build Docker image locally +Additional resources need to be downloaded. Run these commands within the Poetry environment: -Set image TAG: - -`export TAG=0.1.0` - -Build a Docker image locally +```bash +poetry run python download.py # dictionaries, embeddings +poetry run python download_names.py +``` -`docker compose -f docker-compose.build.yml build` +## Configuration -Authorize to Amazon (if you are using MFA you have to take temporary ACCESS keys from AWS STS): +NameGraph uses [Hydra](https://hydra.cc/) - a framework for elegantly configuring complex applications. The configuration is stored in the `conf/` directory and includes: -`aws configure` +- Main configuration files (`prod_config_new.yaml`, `test_config_new.yaml`) with core settings like connections, filters, limits, and paths +- Pipeline configurations in `conf/pipelines/` defining generators, modes, categories, and language settings -Authorize to ECR: +The configuration is highly modular and can be easily modified to adjust the behavior of name generation, filtering, and ranking systems. -`./authorize-ecr.sh` +## REST API -Push image to ECR: +Start server using Poetry: +```bash +poetry run uvicorn web_api:app --reload +``` -`docker push 571094861812.dkr.ecr.us-east-1.amazonaws.com/name-generator:${TAG} +Query with POST: +```bash +curl -d '{"label":"armstrong"}' -H "Content-Type: application/json" -X POST http://localhost:8000 +``` -## Deploy image on remote instance +Query with POST (pretokenized input): +```bash +curl -d '{"label":"\"arm strong\""}' -H "Content-Type: application/json" -X POST http://localhost:8000 +``` -Set image TAG: +**Note:** pretokenized input should be wrapped in double quotes. -`export TAG=0.1.0 +## Documentation -Authorize EC2 instance in ECR: +The API documentation is available at `/docs` or `/redoc` when the server is running. These are auto-generated Swagger/OpenAPI docs provided by FastAPI that allow you to: -`aws ecr get-login-password | docker login --username AWS --password-stdin 571094861812.dkr.ecr.us-east-1.amazonaws.com/name-generator` +- View all available endpoints +- See request/response schemas +- See descriptions of each parameter and response field +- Test API calls directly from the browser -(Re-Deploy) image: +Public API documentation is available at [api.namegraph.dev/docs](https://api.namegraph.dev/docs). -`docker compose up -d` +## Tests -Check if service works: +Run tests using Poetry: +```bash +poetry run pytest +``` -`curl -d '{"label":"firestarter"}' -H "Content-Type: application/json" -X POST http://44.203.61.202` +Tests that interact with external services (Elasticsearch) are marked with `integration_test` marker and are disabled by default. 
Define environment variables needed to access Elasticsearch and run them using: +```bash +poetry run pytest -m "integration_test" +``` ## Learning-To-Rank -To access the LTR features, you need to configure it in the Elasticsearch instance (see [here](https://github.com/namehash/collection-templates/tree/master/research/learning-to-rank/readme.md) for more details). - -## Pipelines, weights, sampler - -In `conf/prod_config_new.yaml` are defined `generator_limits` which limits maximum number of suggestions generated by each generator. This is for optimization. E.g.: -```yaml - generator_limits: - HyphenGenerator: 128 - AbbreviationGenerator: 128 - EmojiGenerator: 150 - Wikipedia2VGenerator: 100 - RandomAvailableNameGenerator: 20000 -``` - -In `conf/pipelines/prod_new.yaml` are defined pipelines. Each pipeline have: -* a `name` -* one `generator` -* list of `filters`, e.g. SubnameFilter, ValidNameFilter, ValidNameLengthFilter, DomainFilter -* `weights` for each interpretation type (`ngram`, `person`, `other`) and each language -* `mode_weights_multiplier` - a multiplier of above weights for each mode (e.g. `instant`, `domain_detail`, `full`) -* `global_limits` for each mode, which can be integer (absolute number) or float (percentage of all results); also you can override values for `grouped_by_category` endpoint by adding prefix `grouped_` (e.g. `grouped_instant`, `grouped_domain_detail`, `grouped_full`) - -Setting `0` in `mode_weights_multiplier` or `global_limits` disables the pipeline in a given mode. - -### Sampler - -Each request defines: -* `mode` -* `min_suggestions` -* `max_suggestions` -* `min_available_fraction` - -A name can have many interpretations. Every interpretation has type (`ngram`, `person`, `other`) and language. Every interpretation have a probability. There might be more than one interpretation with the same type and language. - -For each pair of type and language, probabilities of each pipeline are computed. - -1. If there is enough suggestions then break. -2. If all pipeline probabilities for every pair of type nad language are 0 then break. -3. Sample type and language, then sample interpretation within this type and language. -4. Sample a pipeline for the sampled interpretation. The first pass of sampling is without replacement to increase diversity in top suggestions. -5. If the pipeline exceeds its global limit then go to 4. -6. Get a suggestion from the pipeline. (The generator is executed here). If there is no more suggestions then go to 4. -7. If the suggestion have been already sampled then go to 6. -8. If the suggestion is not available and there is room only for available then go to 6. -9. If the suggestion is not normalized then go to 6. -10. Go to 1. - -Exhausted pipelines are removed from sampling. - -### Grouped by category - -Parameters: -* `mode` -* `min_available_fraction` -* max number of categories -* max number of suggestions per category -* max related categories -* min total categories? -* max total categories? - -Requirements: -* order of categories is fixed -* every generator must be mapped to only one category -* flag generator suggestion should appear in 10% of suggestions - maybe we should detect if it is first search by a user - * should we remove first pass of sampling with every generator? - -1. Shuffle order of categories (using weights?) if min number of categories is smaller than all categories. If some category does not return suggestions then we take the next one. -2. 
Within each category: sample a type and language of interpretation, then sample an interpretation with this type and language. Sample a pipeline (weights of pipelines depend on type and language). Do it in parallel?
-3. Sample `max number of suggestions per category`. How to handle `min_available_fraction`?
-
-### Suggestions by category
-
-For each category a MetaSampler is created with appropriate pipelines.
-In parallel, all MetaSamplers are executed. In one MetaSampler:
-1. Apply global limits.
-2. For each interpretation (interpretation_type, lang, tokenization) a sampler is created.
-
-
-After generation of suggestions for all categories:
-1. For each category, the number of suggestions is limited by the category's `max_suggestions`.
-2. If `count_real_suggestions` < `min_total_suggestions` then RandomAvailable names are appended as the `other` category.
+To access the LTR features, you need to configure it in the Elasticsearch instance (see [here](https://github.com/namehash/namegraph-collections/tree/master/research/learning-to-rank/readme.md) for more details).

From 0a2634b58432bcfd74387832dd49b41809ef61ac Mon Sep 17 00:00:00 2001
From: Mykola Haltiuk
Date: Wed, 12 Mar 2025 17:29:11 +0100
Subject: [PATCH 2/3] temporarily substituted logo for text

---
 readme.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/readme.md b/readme.md
index 2bd831c0..a913a9ce 100644
--- a/readme.md
+++ b/readme.md
@@ -7,12 +7,13 @@

- + +

NameGraph

From 92c35ef5944d1acd06f5bfdfdf14fb0b95d6772f Mon Sep 17 00:00:00 2001 From: Mykola Haltiuk Date: Wed, 12 Mar 2025 17:30:26 +0100 Subject: [PATCH 3/3] exporting all names from collections --- research/elasticsearch/export-names.py | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 research/elasticsearch/export-names.py diff --git a/research/elasticsearch/export-names.py b/research/elasticsearch/export-names.py new file mode 100644 index 00000000..43ce8cde --- /dev/null +++ b/research/elasticsearch/export-names.py @@ -0,0 +1,58 @@ +from elasticsearch import Elasticsearch +from elasticsearch.helpers import scan +from tqdm import tqdm +import os + +# Get credentials from environment variables +ES_SCHEME = os.getenv('ES_SCHEME', 'http') +ES_HOST = os.getenv('ES_HOST') +ES_PORT = int(os.getenv('ES_PORT')) +ES_USERNAME = os.getenv('ES_USERNAME') +ES_PASSWORD = os.getenv('ES_PASSWORD') +ES_INDEX = os.getenv('ES_INDEX') + +# Initialize Elasticsearch client +es = Elasticsearch( + hosts=[{ + 'scheme': ES_SCHEME, + 'host': ES_HOST, + 'port': ES_PORT + }], + http_auth=(ES_USERNAME, ES_PASSWORD), + timeout=60, + http_compress=True, +) + +# Query to get all documents +query = { + "_source": ["data.names.normalized_name"], + "query": { + "match_all": {} + } +} + +# First, count total documents for progress bar +total_docs = es.count(index=ES_INDEX, body={"query": {"match_all": {}}})["count"] + +# Initialize set to store unique names +unique_names = set() + +# Scan through all documents with progress bar +print("Scanning documents...") +with tqdm(total=total_docs, desc="Processing documents") as pbar: + for doc in scan(es, query=query, index=ES_INDEX): + if "data" in doc["_source"] and "names" in doc["_source"]["data"]: + names = doc["_source"]["data"]["names"] + for name in names: + if "normalized_name" in name: + unique_names.add(name["normalized_name"]) + pbar.update(1) + +# Write unique names to file with progress bar +output_file = "exported_names.txt" +print(f"\nWriting {len(unique_names)} unique names to {output_file}...") +with open(output_file, "w", encoding="utf-8") as f: + for name in tqdm(unique_names, desc="Writing names"): + f.write(f"{name}\n") + +print(f"Export complete! {len(unique_names)} unique names written to {output_file}")
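A usage sketch for the export script above: the environment variable names are exactly the ones the script reads, while the values are placeholders for your own cluster and index.

```bash
# Values are placeholders; the variable names match those read by export-names.py.
export ES_SCHEME=https ES_HOST=my-es-host ES_PORT=9243
export ES_USERNAME=elastic ES_PASSWORD='...' ES_INDEX=my-collections-index
poetry run python research/elasticsearch/export-names.py  # writes exported_names.txt
```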