diff --git a/docs/media/domain-detail.dark.svg b/docs/media/domain-detail.dark.svg new file mode 100644 index 00000000..80301a14 --- /dev/null +++ b/docs/media/domain-detail.dark.svg @@ -0,0 +1,2 @@ + + diff --git a/docs/media/domain-detail.light.svg b/docs/media/domain-detail.light.svg new file mode 100644 index 00000000..4130271f --- /dev/null +++ b/docs/media/domain-detail.light.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/docs/media/full.dark.svg b/docs/media/full.dark.svg new file mode 100644 index 00000000..950f7cae --- /dev/null +++ b/docs/media/full.dark.svg @@ -0,0 +1,31 @@ + + + + + + + + + + diff --git a/docs/media/full.light.svg b/docs/media/full.light.svg new file mode 100644 index 00000000..5ba534aa --- /dev/null +++ b/docs/media/full.light.svg @@ -0,0 +1,31 @@ + + + + + + + + + + \ No newline at end of file diff --git a/docs/media/generators.dark.svg b/docs/media/generators.dark.svg new file mode 100644 index 00000000..077f7772 --- /dev/null +++ b/docs/media/generators.dark.svg @@ -0,0 +1,4 @@ + + + +

Wordplay

Alternates

Emojify

Expand

Community

Go Wild

Other

Suffix

PersonNameExpand

Prefix

Rhymes

Emoji

Keycap

FlagAffix

Symbol

PersonNameEmojify

Permute

Hyphen

Abbreviation

Wikipedia2V

W2V

SubstringMatch

RandomAvailableName

OnSaleMatch

Categories

Leet

Reverse

EasterEgg

SpecialCharacterAffix

WordnetSynonyms

\ No newline at end of file diff --git a/docs/media/generators.light.svg b/docs/media/generators.light.svg new file mode 100644 index 00000000..023566f2 --- /dev/null +++ b/docs/media/generators.light.svg @@ -0,0 +1,4 @@ + + + +

Wordplay

Alternates

Emojify

Expand

Community

Go Wild

Other

Suffix

PersonNameExpand

Prefix

Rhymes

Emoji

Keycap

FlagAffix

Symbol

PersonNameEmojify

Permute

Hyphen

Abbreviation

Wikipedia2V

W2V

SubstringMatch

RandomAvailableName

OnSaleMatch

Categories

Leet

Reverse

EasterEgg

SpecialCharacterAffix

WordnetSynonyms

\ No newline at end of file diff --git a/docs/media/instant.dark.svg b/docs/media/instant.dark.svg new file mode 100644 index 00000000..65841eac --- /dev/null +++ b/docs/media/instant.dark.svg @@ -0,0 +1,2 @@ + + diff --git a/docs/media/instant.light.svg b/docs/media/instant.light.svg new file mode 100644 index 00000000..016c202c --- /dev/null +++ b/docs/media/instant.light.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/docs/media/logo-light.svg b/docs/media/logo-light.svg new file mode 100644 index 00000000..36dee215 --- /dev/null +++ b/docs/media/logo-light.svg @@ -0,0 +1,13 @@ + + + + NameGraph + + + + + + beta + + + diff --git a/docs/old_readme.md b/docs/old_readme.md new file mode 100644 index 00000000..a6066f4c --- /dev/null +++ b/docs/old_readme.md @@ -0,0 +1,202 @@ +![Tests](https://github.com/namehash/namegraph/actions/workflows/ci.yml/badge.svg?branch=master) + +# Install + +``` +pip3 install -e . +``` + +# Usage + +The application can be run: +* reading queries from stdin +* reading query as an argument + +Additional resources need to be downloaded: +``` +python download.py # dictionaries, embeddings +python download_names.py +``` + +## Queries from stdin + +``` +python ./generator/app.py app.input=stdin +``` + +## Query as an argument + +The application can be run with + +``` +python ./generator/app.py +``` + +It will generate suggestions for the default query. + +The default parameters are defined in `conf/config.yaml`. Any of the parameters might be substituted with a path to the +parameter, with dot-separated fragments, e.g. + +``` +python ./generator/app.py app.query=firepower +``` + +will substitute the default query with the provided one. + +The parameters are documented in the config. 
+ +# REST API + +Start server: +``` +python -m uvicorn web_api:app --reload +``` + +Query with POST: +``` +curl -d '{"label":"fire"}' -H "Content-Type: application/json" -X POST http://localhost:8000 +``` + +# Tests + +Run: +``` +pytest +``` +or without slow tests: +``` +pytest -m "not slow" +``` + +## Debugging + +Run app with `app.logging_level=DEBUG` to see debug information: +``` +python generator/app.py app.input=stdin app.logging_level=DEBUG +``` + +# Deployment + +## Build Docker image locally + +Set image TAG: + +`export TAG=0.1.0` + +Build a Docker image locally + +`docker compose -f docker-compose.build.yml build` + +Authorize to Amazon (if you are using MFA you have to take temporary ACCESS keys from AWS STS): + +`aws configure` + +Authorize to ECR: + +`./authorize-ecr.sh` + +Push image to ECR: + +`docker push 571094861812.dkr.ecr.us-east-1.amazonaws.com/name-generator:${TAG}` + +## Deploy image on remote instance + +Set image TAG: + +`export TAG=0.1.0` + +Authorize EC2 instance in ECR: + +`aws ecr get-login-password | docker login --username AWS --password-stdin 571094861812.dkr.ecr.us-east-1.amazonaws.com/name-generator` + +(Re-Deploy) image: + +`docker compose up -d` + +Check if service works: + +`curl -d '{"label":"firestarter"}' -H "Content-Type: application/json" -X POST http://44.203.61.202` + +## Learning-To-Rank + +To access the LTR features, you need to configure it in the Elasticsearch instance (see [here](https://github.com/namehash/collection-templates/tree/master/research/learning-to-rank/readme.md) for more details). + +## Pipelines, weights, sampler + +In `conf/prod_config_new.yaml` are defined `generator_limits` which limits maximum number of suggestions generated by each generator. This is for optimization. 
E.g.: +```yaml + generator_limits: + HyphenGenerator: 128 + AbbreviationGenerator: 128 + EmojiGenerator: 150 + Wikipedia2VGenerator: 100 + RandomAvailableNameGenerator: 20000 +``` + +In `conf/pipelines/prod_new.yaml` are defined pipelines. Each pipeline has: +* a `name` +* one `generator` +* list of `filters`, e.g. SubnameFilter, ValidNameFilter, ValidNameLengthFilter, DomainFilter +* `weights` for each interpretation type (`ngram`, `person`, `other`) and each language +* `mode_weights_multiplier` - a multiplier of above weights for each mode (e.g. `instant`, `domain_detail`, `full`) +* `global_limits` for each mode, which can be integer (absolute number) or float (percentage of all results); also you can override values for `grouped_by_category` endpoint by adding prefix `grouped_` (e.g. `grouped_instant`, `grouped_domain_detail`, `grouped_full`) + +Setting `0` in `mode_weights_multiplier` or `global_limits` disables the pipeline in a given mode. + +### Sampler + +Each request defines: +* `mode` +* `min_suggestions` +* `max_suggestions` +* `min_available_fraction` + +A name can have many interpretations. Every interpretation has type (`ngram`, `person`, `other`) and language. Every interpretation has a probability. There might be more than one interpretation with the same type and language. + +For each pair of type and language, probabilities of each pipeline are computed. + +1. If there are enough suggestions then break. +2. If all pipeline probabilities for every pair of type and language are 0 then break. +3. Sample type and language, then sample interpretation within this type and language. +4. Sample a pipeline for the sampled interpretation. The first pass of sampling is without replacement to increase diversity in top suggestions. +5. If the pipeline exceeds its global limit then go to 4. +6. Get a suggestion from the pipeline. (The generator is executed here). If there are no more suggestions then go to 4. +7. 
If the suggestion has already been sampled then go to 6. +8. If the suggestion is not available and there is room only for available then go to 6. +9. If the suggestion is not normalized then go to 6. +10. Go to 1. + +Exhausted pipelines are removed from sampling. + +### Grouped by category + +Parameters: +* `mode` +* `min_available_fraction` +* max number of categories +* max number of suggestions per category +* max related categories +* min total categories? +* max total categories? + +Requirements: +* order of categories is fixed +* every generator must be mapped to only one category +* flag generator suggestion should appear in 10% of suggestions - maybe we should detect if it is first search by a user + * should we remove first pass of sampling with every generator? + +1. Shuffle order of categories (using weights?) if min number of categories is smaller than all categories. If some category does not return suggestions then we take the next one. +2. Within each category: sample type and lang of interpretation, sample interpretation with this type and lang. Sample pipeline (weights of pipelines depend on type and language). Do it in parallel? +3. Sample `max number of suggestions per category`. How to handle `min_available_fraction`? + +### Suggestions by category + +For each category MetaSampler is created with appropriate pipelines. +In parallel, all MetaSamplers are executed. In one MetaSampler: +1. Apply global limits. +2. For each interpretation (interpretation_type, lang, tokenization) a sampler is created. + + +After generation of suggestions for all categories: +1. For each category number of suggestions is limited by category's `max_suggestions`. +2. If `count_real_suggestions` < `min_total_suggestions` then RandomAvailable names are appended as `other` category. 
diff --git a/readme.md b/readme.md index 886e20d1..a913a9ce 100644 --- a/readme.md +++ b/readme.md @@ -1,225 +1,314 @@ -![Tests](https://github.com/namehash/namegraph/actions/workflows/ci.yml/badge.svg?branch=master) -# Install + -``` -pip3 install -e . -``` +
+
+ + + +

+ +

NameGraph

+

-# Usage + +

+ Surf more than 21 million name ideas across more than 400,000 name collections,
or generate infinite related name suggestions. +

-The application can be run: -* reading queries from stdin -* reading query as an argument + +

+ + + + + + Python Build Status + + + + + + Python CI Status + + + + + + status: beta + + +

-Additional resources need to be downloaded: -``` -python download.py # dictionaries, embeddings -python download_names.py -``` +# Project Status -## Queries from stdin +NameGraph is currently in beta. We are excited to share our work with you and continue to build the greatest web of names in history! -``` -python ./generator/app.py app.input=stdin -``` +# Overview -## Query as an argument +NameGraph is a web service that generates name suggestions for a given input label. It is implemented using FastAPI and provides a variety of endpoints to generate suggestions in different modes and with different parameters. -The application can be run with +## Label Analysis -``` -python ./generator/app.py -``` +The input label is analyzed to determine the most relevant name suggestions. The analysis includes: -It will generate suggestions for the default query. +- Defining all possible interpretations of the input label along with their probabilities (whether it is a sequence of common words, a person name, what is the language, etc.) +- For each interpretation, determining most probable tokenizations (e.g. `armstrong` -> `["armstrong"]`, `armstrong` -> `["arm", "strong"]`) -The default parameters are defined in `conf/config.yaml`. Any of the parameters might be substituted with a path to the -parameter, with dot-separated fragments, e.g. +The suggestions are later generated based on these interpretations, tokenizations being especially important, since many generators greatly rely on them. This is why the endpoints can handle pretokenized input. -``` -python ./generator/app.py app.query=firepower -``` +## Collections -will substitute the default query with the provided one. +Collections are curated sets of names that serve as a core component of NameGraph's name suggestion system. The system maintains a vast database of over 400,000 name collections containing more than 21 million unique names. 
Each collection is stored in Elasticsearch and contains: -The parameters are documented in the config. +- A unique collection ID +- Collection title and description +- Collection rank and metadata +- Member names with their normalized and tokenized forms +- Collection types and categories +- Related collections -# REST API +Collections are used in several key ways: -Start server: -``` -python -m uvicorn web_api:app --reload -``` +1. Direct Name Generation: + - Searches collections based on input tokens + - Uses [learning-to-rank models](#learning-to-rank) to find relevant collections -Query with POST: -``` -curl -d '{"label":"fire"}' -H "Content-Type: application/json" -X POST http://localhost:8000 -``` +2. Related Collections: + - Finds collections with similar themes and content + - Ensures diverse suggestions across different categories -# Tests +3. Membership Lookup: + - Discovers collections containing specific names + - Enables finding thematically related names -Run: -``` -pytest -``` -or without slow tests: -``` -pytest -m "not slow" -``` - -## Debugging - -Run app with `app.logging_level=DEBUG` to see debug information: -``` -python generator/app.py app.input=stdin app.logging_level=DEBUG -``` +The collections are maintained and updated through our [NameGraph Collections](https://github.com/namehash/namegraph-collections) project, ensuring the suggestion database stays current and comprehensive. -# Deployment +## Generators -## Build Docker image locally +Generators are core components that create name suggestions through different methods. Each generator inherits from the base [NameGenerator](namegraph/generation/name_generator.py) class and implements specific name generation strategies. They can be grouped into the categories as shown in the diagram below: -Set image TAG: +

+ + + NameKit + +

+ +## Modes -`export TAG=0.1.0` - -Build a Docker image locally - -`docker compose -f docker-compose.build.yml build` - -Authorize to Amazon (if you are using MFA you have to take temporary ACCESS keys from AWS STS): +NameGraph supports three modes for processing requests: -`aws configure` +- Instant Mode (`instant`): + - Fastest response time + - More basic name generations + - Some advanced generators like W2VGenerator are disabled (weight multiplier = 0) + - Often used for real-time suggestions -Authorize to ECR: +- Domain Detail Mode (`domain_detail`): + - Intermediate between instant and full + - More comprehensive than instant, but still optimized for performance + - Some generators have reduced weights compared to full mode + - Expanded search window for collection ranking and sampling -`./authorize-ecr.sh` +- Full Mode (`full`): + - Most comprehensive name generation + - Includes all enabled generators + - Uses full weights for most generators + - Accesses advanced generators like `Wikipedia2VGenerator` and `W2VGenerator` + - Takes longer to process, but provides the most diverse results -Push image to ECR: -`docker push 571094861812.dkr.ecr.us-east-1.amazonaws.com/name-generator:${TAG}` +Different generators are enabled/disabled for each mode. Take a look at the [generators diagram](#generators) to see which generators are available in each mode. + +
+ + + + + + + + + + + + + + + + + + + + + +
IconModeDescription
InstantInstantFastest response, basic generators only
Domain DetailDomain DetailBalanced speed/quality, expanded search
FullFullComprehensive generation with all generators
+
+ +## Sampler + +The sampler is a sophisticated component that manages the selection and generation of name suggestions. It implements a probabilistic sampling algorithm that balances diversity, relevance, and efficiency while respecting various constraints. + +### Key Components + +- **Request Parameters**: + - `mode`: Determines which generators are active (`instant`/`domain_detail`/`full`) + - `min_suggestions`: Minimum number of suggestions to return + - `max_suggestions`: Maximum number of suggestions to return + - `min_available_fraction`: Minimum fraction of suggestions that must be available + +- **Interpretations**: Each input name can have multiple interpretations, characterized by: + - Type (`ngram`, `person`, `other`) + - Language + - Probability score + - Possible tokenizations + +### Sampling Algorithm + +The sampler uses a probabilistic approach to generate diverse and relevant name suggestions: + +```mermaid +flowchart TD + A[Start] --> B{Enough suggestions?} + B -->|Yes| Z[End] + B -->|No| C{All probabilities = 0?} + C -->|Yes| Z + C -->|No| D[Sample type & language] + D --> E["Sample tokenization"] + E --> F[Sample pipeline] + F --> G{Pipeline exceeds limit?} + G -->|Yes| F + G -->|No| H[Get suggestion from pipeline] + H --> I{Any suggestions left?} + I -->|Yes| J{Already sampled?} + I -->|No| F + J -->|Yes| H + J -->|No| K{Available if required?} + K -->|No| H + K -->|Yes| L{Normalized?} + L -->|No| H + L -->|Yes| B +``` -## Deploy image on remote instance +The algorithm works as follows: -Set image TAG: +1. **Initialization**: For each type-language pair, pipeline probabilities are computed. -`export TAG=0.1.0` +2. **Main Loop**: The sampler iterates until either: + - Enough suggestions are generated (`max_suggestions` met) + - All pipeline probabilities become zero -Authorize EC2 instance in ECR: +3. 
**Sampling Process**: + - First samples a type and language pair + - Then samples a specific tokenization within that pair + - Selects a pipeline using probability-based sampling + - First pass uses sampling without replacement for diversity + +4. **Validation Checks**: + - Verifies pipeline hasn't exceeded its global limit + - Ensures suggestions aren't duplicates + - Checks availability status if required + - Confirms normalization status -`aws ecr get-login-password | docker login --username AWS --password-stdin 571094861812.dkr.ecr.us-east-1.amazonaws.com/name-generator` +5. **Pipeline Management**: + - Exhausted pipelines are removed from the sampling pool + - When a pipeline can't generate more suggestions, falls back to other pipelines -(Re-Deploy) image: +This approach ensures a balanced mix of suggestions while maintaining efficiency and respecting all configured constraints. -`docker compose up -d` +# Usage -Check if service works: +NameGraph uses [Poetry](https://python-poetry.org/) for dependency management and packaging. Before getting started, make sure you have Poetry installed on your system. -`curl -d '{"label":"firestarter"}' -H "Content-Type: application/json" -X POST http://44.203.61.202` +## Prerequisites -## Learning-To-Rank +Install Poetry if you haven't already: +```bash +curl -sSL https://install.python-poetry.org | python3 - +``` -To access the LTR features, you need to configure it in the Elasticsearch instance (see [here](https://github.com/namehash/collection-templates/tree/master/research/learning-to-rank/readme.md) for more details). +Visit [Poetry installation guide](https://python-poetry.org/docs/#installation) for more details. -## Pipelines, weights, sampler +## Install -In `conf/prod_config_new.yaml` are defined `generator_limits` which limits maximum number of suggestions generated by each generator. This is for optimization. 
E.g.: -```yaml - generator_limits: - HyphenGenerator: 128 - AbbreviationGenerator: 128 - EmojiGenerator: 150 - Wikipedia2VGenerator: 100 - RandomAvailableNameGenerator: 20000 +Clone the repository and install dependencies: +```bash +git clone https://github.com/namehash/namegraph.git +cd namegraph +poetry install ``` -In `conf/pipelines/prod_new.yaml` are defined pipelines. Each pipeline have: -* a `name` -* one `generator` -* list of `filters`, e.g. SubnameFilter, ValidNameFilter, ValidNameLengthFilter, DomainFilter -* `weights` for each interpretation type (`ngram`, `person`, `other`) and each language -* `mode_weights_multiplier` - a multiplier of above weights for each mode (e.g. `instant`, `domain_detail`, `full`) -* `global_limits` for each mode, which can be integer (absolute number) or float (percentage of all results); also you can override values for `grouped_by_category` endpoint by adding prefix `grouped_` (e.g. `grouped_instant`, `grouped_domain_detail`, `grouped_full`) +## Download resources -Setting `0` in `mode_weights_multiplier` or `global_limits` disables the pipeline in a given mode. +Additional resources need to be downloaded. Run these commands within the Poetry environment: -### Modes +```bash +poetry run python download.py # dictionaries, embeddings +poetry run python download_names.py +``` -NameGraph supports three modes for processing requests: +## Configuration -- Instant Mode (`instant`): - - Fastest response time - - More basic name generations - - Some advanced generators like W2VGenerator are disabled (weight multiplier = 0) - - Often used for real-time suggestions +NameGraph uses [Hydra](https://hydra.cc/) - a framework for elegantly configuring complex applications. 
The configuration is stored in the `conf/` directory and includes: -- Domain Detail Mode (`domain_detail`): - - Intermediate between instant and full - - More comprehensive than instant, but still optimized for performance - - Some generators have reduced weights compared to full mode - - Expanded search window for collection ranking and sampling +- Main configuration files (`prod_config_new.yaml`, `test_config_new.yaml`) with core settings like connections, filters, limits, and paths +- Pipeline configurations in `conf/pipelines/` defining generators, modes, categories, and language settings -- Full Mode (`full`): - - Most comprehensive name generation - - Includes all enabled generators - - Uses full weights for most generators - - Accesses advanced generators like `Wikipedia2VGenerator` and `W2VGenerator` - - Takes longer to process, but provides the most diverse results +The configuration is highly modular and can be easily modified to adjust the behavior of name generation, filtering, and ranking systems. -### Sampler +## REST API -Each request defines: -* `mode` -* `min_suggestions` -* `max_suggestions` -* `min_available_fraction` +Start server using Poetry: +```bash +poetry run uvicorn web_api:app --reload +``` -A name can have many interpretations. Every interpretation has type (`ngram`, `person`, `other`) and language. Every interpretation have a probability. There might be more than one interpretation with the same type and language. +Query with POST: +```bash +curl -d '{"label":"armstrong"}' -H "Content-Type: application/json" -X POST http://localhost:8000 +``` -For each pair of type and language, probabilities of each pipeline are computed. +Query with POST (pretokenized input): +```bash +curl -d '{"label":"\"arm strong\""}' -H "Content-Type: application/json" -X POST http://localhost:8000 +``` -1. If there is enough suggestions then break. -2. If all pipeline probabilities for every pair of type nad language are 0 then break. -3. 
Sample type and language, then sample interpretation within this type and language. -4. Sample a pipeline for the sampled interpretation. The first pass of sampling is without replacement to increase diversity in top suggestions. -5. If the pipeline exceeds its global limit then go to 4. -6. Get a suggestion from the pipeline. (The generator is executed here). If there is no more suggestions then go to 4. -7. If the suggestion have been already sampled then go to 6. -8. If the suggestion is not available and there is room only for available then go to 6. -9. If the suggestion is not normalized then go to 6. -10. Go to 1. +**Note:** pretokenized input should be wrapped in double quotes. -Exhausted pipelines are removed from sampling. +## Documentation -### Grouped by category +The API documentation is available at `/docs` or `/redoc` when the server is running. These are auto-generated Swagger/OpenAPI docs provided by FastAPI that allow you to: -Parameters: -* `mode` -* `min_available_fraction` -* max number of categories -* max number of suggestions per category -* max related categories -* min total categories? -* max total categories? +- View all available endpoints +- See request/response schemas +- See descriptions of each parameter and response field +- Test API calls directly from the browser -Requirements: -* order of categories is fixed -* every generator must be mapped to only one category -* flag generator suggestion should appear in 10% of suggestions - maybe we should detect if it is first search by a user - * should we remove first pass of sampling with every generator? +Public API documentation is available at [api.namegraph.dev/docs](https://api.namegraph.dev/docs). -1. Shuffle order of categories (using weights?) if min number of categories is smaller than all categories. If some category does not return suggestions then we take the next one. -2. 
Within each category: sample type and lang of interpretation, sample interpretaion with this type and lang. Sample pipeline (weights of pipelines depends on type and language. Do it in parallel? -3. Sample `max number of suggestions per category`. How handle `min_available_fraction`? +## Tests -### Suggestions by category +Run tests using Poetry: +```bash +poetry run pytest +``` -For each category MetaSampler is created with appropriate pipelines. -In parallel, all MetaSamplers are exectuted. In one MetaSampler: -1. Apply global limits. -2. For each interpretation (interpretation_type, lang, tokenization) a sampler is created. +Tests that interact with external services (Elasticsearch) are marked with `integration_test` marker and are disabled by default. Define environment variables needed to access Elasticsearch and run them using: +```bash +poetry run pytest -m "integration_test" +``` +## Learning-To-Rank -After generation of suggestions for all categories: -1. For each category number of suggestions is limited by category's `max_suggestions`. -2. If `count_real_suggestions` < `min_total_suggestions` then RandomAvailable names are appended as `other` category. +To access the LTR features, you need to configure it in the Elasticsearch instance (see [here](https://github.com/namehash/namegraph-collections/tree/master/research/learning-to-rank/readme.md) for more details). 
diff --git a/research/elasticsearch/export-names.py b/research/elasticsearch/export-names.py new file mode 100644 index 00000000..43ce8cde --- /dev/null +++ b/research/elasticsearch/export-names.py @@ -0,0 +1,58 @@ +from elasticsearch import Elasticsearch +from elasticsearch.helpers import scan +from tqdm import tqdm +import os + +# Get credentials from environment variables +ES_SCHEME = os.getenv('ES_SCHEME', 'http') +ES_HOST = os.getenv('ES_HOST') +ES_PORT = int(os.getenv('ES_PORT')) +ES_USERNAME = os.getenv('ES_USERNAME') +ES_PASSWORD = os.getenv('ES_PASSWORD') +ES_INDEX = os.getenv('ES_INDEX') + +# Initialize Elasticsearch client +es = Elasticsearch( + hosts=[{ + 'scheme': ES_SCHEME, + 'host': ES_HOST, + 'port': ES_PORT + }], + http_auth=(ES_USERNAME, ES_PASSWORD), + timeout=60, + http_compress=True, +) + +# Query to get all documents +query = { + "_source": ["data.names.normalized_name"], + "query": { + "match_all": {} + } +} + +# First, count total documents for progress bar +total_docs = es.count(index=ES_INDEX, body={"query": {"match_all": {}}})["count"] + +# Initialize set to store unique names +unique_names = set() + +# Scan through all documents with progress bar +print("Scanning documents...") +with tqdm(total=total_docs, desc="Processing documents") as pbar: + for doc in scan(es, query=query, index=ES_INDEX): + if "data" in doc["_source"] and "names" in doc["_source"]["data"]: + names = doc["_source"]["data"]["names"] + for name in names: + if "normalized_name" in name: + unique_names.add(name["normalized_name"]) + pbar.update(1) + +# Write unique names to file with progress bar +output_file = "exported_names.txt" +print(f"\nWriting {len(unique_names)} unique names to {output_file}...") +with open(output_file, "w", encoding="utf-8") as f: + for name in tqdm(unique_names, desc="Writing names"): + f.write(f"{name}\n") + +print(f"Export complete! {len(unique_names)} unique names written to {output_file}")