diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6b5e31b..cff3449 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,21 +15,15 @@ jobs: compare-php: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 with: python-version: 3.x - # A naive `docker compose up` would first build the `python-api` container and then - # start all services, which kickstarts Elastic Search and building indices. - # But since those two steps are independent, we can parallelize them to save time. - - run: | - docker compose build python-api - docker compose up -d --wait python-api php-api - - run: docker container ls && docker image ls - - run: docker exec python-api python -m pip freeze - - run: docker exec python-api coverage run -m pytest -xv -m "php_api" - - run: docker exec python-api coverage xml + # `up --wait` can report failure when a one-shot container exits, even with code 0 (https://github.com/docker/compose/issues/10596), so fall back to counting non-zero exit codes across all containers, including stopped ones. + - run: docker compose --profile "python" --profile "php" up --detach --wait --remove-orphans || exit $(docker compose --profile "python" --profile "php" ps --all --quiet | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l) + - run: docker exec openml-python-rest-api coverage run -m pytest -v -m "php_api" + - run: docker exec openml-python-rest-api coverage xml - name: Upload results to Codecov uses: codecov/codecov-action@v4 with: @@ -37,15 +31,13 @@ jobs: python: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v6 + - uses: actions/setup-python@v6 with: python-version: 3.x - - run: docker compose up -d --wait database python-api - - run: docker container ls && docker image ls - - run: docker exec python-api python -m pip freeze - - run: docker exec python-api coverage run -m pytest -xv -m "not php_api" - - run: docker exec python-api coverage xml + - run: docker compose --profile "python" up --detach --wait --remove-orphans || exit 
$(docker compose --profile "python" ps --all --quiet | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l) + - run: docker exec openml-python-rest-api coverage run -m pytest -v -m "not php_api" + - run: docker exec openml-python-rest-api coverage xml - name: Upload results to Codecov uses: codecov/codecov-action@v4 with: diff --git a/.gitignore b/.gitignore index 75dd10d..5a1ad1b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ docker/mysql/data +.DS_Store # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/docker-compose.yaml b/docker-compose.yaml index 8b0ef56..324350f 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,13 +1,33 @@ services: database: - image: "openml/test-database" + profiles: ["python", "php", "all"] + image: "openml/test-database:20240105" container_name: "openml-test-database" environment: MYSQL_ROOT_PASSWORD: ok ports: - "3306:3306" + healthcheck: + test: ["CMD", "mysqladmin", "ping", "-h", "localhost"] + start_period: 30s + start_interval: 1s + timeout: 3s + interval: 5s + retries: 10 + + database-setup: + profiles: ["python", "php", "all"] + image: mysql + container_name: "openml-test-database-setup" + volumes: + - ./docker/database/update.sh:/database-update.sh + command: /bin/sh -c "/database-update.sh" + depends_on: + database: + condition: service_healthy docs: + profiles: ["all"] build: context: . 
dockerfile: docker/docs/Dockerfile @@ -16,8 +36,35 @@ services: volumes: - .:/docs + elasticsearch: + profiles: ["php", "all"] + image: docker.elastic.co/elasticsearch/elasticsearch:6.8.23 + container_name: "openml-elasticsearch" + platform: "linux/amd64" + ports: + - "9200:9200" # also known as /es (nginx) + - "9300:9300" + env_file: docker/elasticsearch/.env + healthcheck: + test: curl 127.0.0.1:9200/_cluster/health | grep -e "green" + start_period: 30s + start_interval: 5s + timeout: 3s + interval: 10s + deploy: + resources: + limits: + cpus: '1' + memory: 1G + reservations: + cpus: '0.2' + memory: 250M + php-api: - image: "openml/php-rest-api" + profiles: ["php", "all"] + image: "openml/php-rest-api:v1.2.2" + container_name: "openml-php-rest-api" + env_file: docker/php/.env ports: - "8002:80" depends_on: @@ -33,7 +80,8 @@ services: interval: 1m python-api: - container_name: "python-api" + profiles: ["python", "all"] + container_name: "openml-python-rest-api" build: context: . dockerfile: docker/python/Dockerfile @@ -43,20 +91,3 @@ services: - .:/python-api depends_on: - database - - elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:6.8.23 - container_name: "elasticsearch" - ports: - - "9200:9200" - - "9300:9300" - environment: - - ELASTIC_PASSWORD=default - - discovery.type=single-node - - xpack.security.enabled=false - healthcheck: - test: curl 127.0.0.1:9200/_cluster/health | grep -e "green" - start_period: 30s - start_interval: 5s - timeout: 3s - interval: 1m diff --git a/docker/database/update.sh b/docker/database/update.sh new file mode 100755 index 0000000..7c87ca8 --- /dev/null +++ b/docker/database/update.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Change the filepath of openml.file +# from "https://www.openml.org/data/download/1666876/phpFsFYVN" +# to "http://minio:9000/datasets/0000/0001/phpFsFYVN" +mysql -hdatabase -uroot -pok -e 'UPDATE openml.file SET filepath = CONCAT("http://minio:9000/datasets/0000/", LPAD(id, 4, "0"), "/", 
SUBSTRING_INDEX(filepath, "/", -1)) WHERE extension="arff";' + +# Update openml_expdb.dataset with the same url +mysql -hdatabase -uroot -pok -e 'UPDATE openml_expdb.dataset DS, openml.file FL SET DS.url = FL.filepath WHERE DS.did = FL.id;' + + + + + +# Create the data_feature_description TABLE. TODO: can we make sure this table exists already? +mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `data_feature_description` ( + `did` int unsigned NOT NULL, + `index` int unsigned NOT NULL, + `uploader` mediumint unsigned NOT NULL, + `date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + `description_type` enum("plain", "ontology") NOT NULL, + `value` varchar(256) NOT NULL, + KEY `did` (`did`,`index`), + CONSTRAINT `data_feature_description_ibfk_1` FOREIGN KEY (`did`, `index`) REFERENCES `data_feature` (`did`, `index`) ON DELETE CASCADE ON UPDATE CASCADE +)' + +# Set dataset 1 to active (used in the Java unit tests) and remove the "deactivated" status of dataset 2 +mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'INSERT IGNORE INTO dataset_status VALUES (1, "active", "2024-01-01 00:00:00", 1)' +mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'DELETE FROM dataset_status WHERE did = 2 AND status = "deactivated";' + +# Temporary fix in case the database is missing the kaggle table. The PHP REST API expects the table to exist while indexing. 
+mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `kaggle` (`dataset_id` int(11) DEFAULT NULL, `kaggle_link` varchar(500) DEFAULT NULL)' diff --git a/docker/elasticsearch/.env b/docker/elasticsearch/.env new file mode 100644 index 0000000..6e19eaa --- /dev/null +++ b/docker/elasticsearch/.env @@ -0,0 +1,3 @@ +ELASTIC_PASSWORD=default +discovery.type=single-node +xpack.security.enabled=false diff --git a/docker/mysql/README.md b/docker/mysql/README.md index edb3b48..95ead55 100644 --- a/docker/mysql/README.md +++ b/docker/mysql/README.md @@ -16,7 +16,7 @@ which sets: You should be able to connect to it using `mysql`: ```bash - +mysql --host 127.0.0.1 --port 3306 -uroot -pok ``` If you do not have `mysql` installed, you may refer to the MySQL image documentation on how to use the image instead to connect over a docker network if you want to connect diff --git a/docker/php/.env b/docker/php/.env new file mode 100644 index 0000000..ad0bb55 --- /dev/null +++ b/docker/php/.env @@ -0,0 +1,14 @@ +API_KEY=AD000000000000000000000000000000 +BASE_URL=http://php-api:80/ +MINIO_URL=http://minio:9000/ +DB_HOST_OPENML=database:3306 +DB_HOST_EXPDB=database:3306 +DB_USER_OPENML=root +DB_PASS_OPENML=ok +DB_USER_EXPDB_WRITE=root +DB_PASS_EXPDB_WRITE=ok +DB_USER_EXPDB_READ=root +DB_PASS_EXPDB_READ=ok +ES_URL=elasticsearch:9200 +ES_PASSWORD=default +INDEX_ES_DURING_STARTUP=false diff --git a/docker/readme.md b/docker/readme.md index 8fc041f..f31c281 100644 --- a/docker/readme.md +++ b/docker/readme.md @@ -7,39 +7,14 @@ This directory contains the files and information to build the following 5 image - docs: the official [mkdocs-material](https://hub.docker.com/r/squidfunk/mkdocs-material) image but with additional plugins installed required for building the documentation in this project's `/doc` directory. 
- - [openml/php-rest-api](https://hub.docker.com/r/openml/php-rest-api): image with the - php back-end code, but ran on [feature/elasticsearch8](https://github.com/openml/openml/tree/feature/elasticsearch8) - branch. - python-api: an image of this project, to facilitate development on any platform. - - [openml/elasticsearch8-prebuilt](https://hub.docker.com/r/openml/elasticsearch8-prebuilt): - the default elasticsearch image, but with indices already built on the test database - through invocation of the old php code. Between the prebuilt indices and the baked-in database, when all images have already been pulled, a `docker compose up` step should only take seconds. 🚀 -## Building `openml/elasticsearch8-prebuilt` -The `openml/elasticsearch8-prebuilt` is not made with a Dockerfile, because it requires -steps of running containers, which to the best of my knowledge is not facilitated by -docker (not even through [multi-stage builds](https://docs.docker.com/build/building/multi-stage/)). -So, instead we build the container state locally and then use [`docker commit`](https://docs.docker.com/engine/reference/commandline/commit/). - -1. run `docker compose up`, but with the `elasticsearch` service pointing to - `docker.elastic.co/elasticsearch/elasticsearch:8.10.4` instead of `openml/elasticsearch8-prebuilt`. -2. build the indices from the `php-api` container: - - 1. Connect to the container: `docker exec -it server-api-php-api-1 /bin/bash` - 2. (optional) Edit `/var/www/openml/index.php` and set L56 to `development` instead of `production`, - this will show progress of building the indices, or print out any error that may occur. - 3. Build the indices: `php /var/www/openml/index.php cron build_es_indices` - 4. Exit the container with `exit`. - -3. Make a commit of the elastic search container with prebuilt indices: `docker commit elasticsearch openml/elasticsearch8-prebuilt` -4. 
Push the image created by the commit: `docker push openml/elasticsearch8-prebuilt` - ## Building for multiple platforms -Following Docker's "[multi-platform images](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwiTutyczsOCAxUUhv0HHe_VA6QQFnoECBAQAQ&url=https%3A%2F%2Fdocs.docker.com%2Fbuild%2Fbuilding%2Fmulti-platform%2F&usg=AOvVaw0YP_mkj5WTYD-0weEfrfDv&opi=89978449)" +Following Docker's "[multi-platform images](https://docs.docker.com/build/building/multi-platform/)" documentation, we can build multi-platform images in a few simple steps: 1. Only the first time, create a docker-container driver: `docker buildx create --name container --driver=docker-container` diff --git a/src/config.py b/src/config.py index 4c97e4f..8a19f04 100644 --- a/src/config.py +++ b/src/config.py @@ -8,6 +8,8 @@ TomlTable = dict[str, typing.Any] +CONFIG_PATH = Path(__file__).parent / "config.toml" + def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable: defaults = configuration["defaults"] @@ -19,9 +21,17 @@ def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable: @functools.cache -def load_database_configuration(file: Path = Path(__file__).parent / "config.toml") -> TomlTable: - configuration = tomllib.loads(file.read_text()) +def _load_configuration(file: Path) -> TomlTable: + return typing.cast(TomlTable, tomllib.loads(file.read_text())) + + +def load_routing_configuration(file: Path = CONFIG_PATH) -> TomlTable: + return typing.cast(TomlTable, _load_configuration(file)["routing"]) + +@functools.cache +def load_database_configuration(file: Path = CONFIG_PATH) -> TomlTable: + configuration = _load_configuration(file) database_configuration = _apply_defaults_to_siblings( configuration["databases"], ) diff --git a/src/config.toml b/src/config.toml index 732d7ad..0812add 100644 --- a/src/config.toml +++ b/src/config.toml @@ -15,3 +15,7 @@ database="openml_expdb" [databases.openml] database="openml" + +[routing] 
+minio_url="http://minio:9000/" +server_url="http://php-api:80/" diff --git a/src/core/formatting.py b/src/core/formatting.py index 83e3f89..174261f 100644 --- a/src/core/formatting.py +++ b/src/core/formatting.py @@ -2,7 +2,7 @@ from sqlalchemy.engine import Row -from config import load_configuration +from config import load_routing_configuration from core.errors import DatasetError from schemas.datasets.openml import DatasetFileFormat @@ -25,15 +25,16 @@ def _format_parquet_url(dataset: Row) -> str | None: if dataset.format.lower() != DatasetFileFormat.ARFF: return None - minio_base_url = load_configuration()["minio_base_url"] - prefix = dataset.did // 10_000 - return f"{minio_base_url}/datasets/{prefix:04d}/{dataset.did:04d}/dataset_{dataset.did}.pq" + minio_base_url = load_routing_configuration()["minio_url"] + ten_thousands_prefix = f"{dataset.did // 10_000:04d}" + padded_id = f"{dataset.did:04d}" + return f"{minio_base_url}datasets/{ten_thousands_prefix}/{padded_id}/dataset_{dataset.did}.pq" def _format_dataset_url(dataset: Row) -> str: - base_url = load_configuration()["arff_base_url"] + base_url = load_routing_configuration()["server_url"] filename = f"{html.escape(dataset.name)}.{dataset.format.lower()}" - return f"{base_url}/data/v1/download/{dataset.file_id}/{filename}" + return f"{base_url}data/v1/download/{dataset.file_id}/{filename}" def _safe_unquote(text: str | None) -> str | None: diff --git a/src/main.py b/src/main.py index 85e2ed2..d8e61b3 100644 --- a/src/main.py +++ b/src/main.py @@ -21,18 +21,18 @@ def _parse_args() -> argparse.Namespace: "uvicorn", "arguments forwarded to uvicorn", ) - uvicorn_options.add_argument( + _ = uvicorn_options.add_argument( "--reload", action="store_true", help="Enable auto-reload", ) - uvicorn_options.add_argument( + _ = uvicorn_options.add_argument( "--host", default="127.0.0.1", type=str, help="Bind socket to this host.", ) - uvicorn_options.add_argument( + _ = uvicorn_options.add_argument( "--port", 
default=8000, type=int, diff --git a/src/routers/openml/flows.py b/src/routers/openml/flows.py index 4eae983..083916b 100644 --- a/src/routers/openml/flows.py +++ b/src/routers/openml/flows.py @@ -49,14 +49,8 @@ def get_flow(flow_id: int, expdb: Annotated[Connection, Depends(expdb_connection ] tags = database.flows.get_tags(flow_id, expdb) - flow_rows = database.flows.get_subflows(for_flow=flow_id, expdb=expdb) - subflows = [ - { - "identifier": flow.identifier, - "flow": get_flow(flow_id=flow.child_id, expdb=expdb), - } - for flow in flow_rows - ] + flow_rows = database.flows.get_subflows(flow_id, expdb) + subflows = [get_flow(flow_id=flow.child_id, expdb=expdb) for flow in flow_rows] return Flow( id_=flow.id, diff --git a/src/routers/openml/tasks.py b/src/routers/openml/tasks.py index 4fcb362..96d0198 100644 --- a/src/routers/openml/tasks.py +++ b/src/routers/openml/tasks.py @@ -7,6 +7,7 @@ from fastapi import APIRouter, Depends, HTTPException from sqlalchemy import Connection, RowMapping, text +import config import database.datasets import database.tasks from routers.dependencies import expdb_connection @@ -139,7 +140,8 @@ def _fill_json_template( # I believe that the operations below are always part of string output, so # we don't need to be careful to avoid losing typedness template = template.replace("[TASK:id]", str(task.task_id)) - return template.replace("[CONSTANT:base_url]", "https://test.openml.org/") + server_url = config.load_routing_configuration()["server_url"] + return template.replace("[CONSTANT:base_url]", server_url) @router.get("/{task_id}") diff --git a/src/schemas/datasets/mldcat_ap.py b/src/schemas/datasets/mldcat_ap.py index 9525431..d7e277f 100644 --- a/src/schemas/datasets/mldcat_ap.py +++ b/src/schemas/datasets/mldcat_ap.py @@ -275,7 +275,7 @@ class DataService(JsonLDObject): class JsonLDGraph(BaseModel): - context: str | dict[str, HttpUrl] = Field(default_factory=dict, serialization_alias="@context") + context: str | dict[str, HttpUrl] 
= Field(default_factory=dict, serialization_alias="@context") # type: ignore[arg-type] graph: list[Distribution | DataService | Dataset | Quality | Feature | Agent | MD5Checksum] = ( Field(default_factory=list, serialization_alias="@graph") ) diff --git a/src/schemas/flows.py b/src/schemas/flows.py index a6cd479..33dc081 100644 --- a/src/schemas/flows.py +++ b/src/schemas/flows.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import datetime -from typing import Any, TypedDict +from typing import Any, Self from pydantic import BaseModel, ConfigDict, Field @@ -25,12 +25,7 @@ class Flow(BaseModel): language: str | None = Field(max_length=128) dependencies: str | None parameter: list[Parameter] - subflows: list[Subflow] + subflows: list[Self] tag: list[str] model_config = ConfigDict(arbitrary_types_allowed=True) - - -class Subflow(TypedDict): - identifier: str | None - flow: Flow diff --git a/tests/conftest.py b/tests/conftest.py index 4d2c2c9..eecc128 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,6 +16,8 @@ from main import create_api from routers.dependencies import expdb_connection, userdb_connection +PHP_API_URL = "http://openml-php-rest-api:80/api/v1/json" + @contextlib.contextmanager def automatic_rollback(engine: Engine) -> Iterator[Connection]: @@ -39,8 +41,8 @@ def user_test() -> Connection: @pytest.fixture -def php_api() -> Iterator[httpx.Client]: - with httpx.Client(base_url="http://server-api-php-api-1:80/api/v1/json") as client: +def php_api() -> httpx.Client: + with httpx.Client(base_url=PHP_API_URL) as client: yield client diff --git a/tests/constants.py b/tests/constants.py index e471fd5..6881f88 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -1,6 +1,6 @@ PRIVATE_DATASET_ID = {130} -IN_PREPARATION_ID = {1, 33} -DEACTIVATED_DATASETS = {2, 131} +IN_PREPARATION_ID = {33} +DEACTIVATED_DATASETS = {131} DATASETS = set(range(1, 132)) NUMBER_OF_DATASETS = len(DATASETS) diff --git 
a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py index 7c1457f..b463d3d 100644 --- a/tests/routers/openml/datasets_test.py +++ b/tests/routers/openml/datasets_test.py @@ -8,6 +8,7 @@ from database.users import User from routers.openml.datasets import get_dataset from schemas.datasets.openml import DatasetMetadata, DatasetStatus +from tests import constants from tests.users import ADMIN_USER, NO_USER, OWNER_USER, SOME_USER, ApiKey @@ -44,14 +45,14 @@ def test_get_dataset(py_api: TestClient) -> None: "description_version": 1, "upload_date": "2014-04-06T23:19:24", "licence": "Public", - "url": "https://test.openml.org/data/v1/download/1/anneal.arff", - "parquet_url": "https://openml1.win.tue.nl/datasets/0000/0001/dataset_1.pq", + "url": "http://php-api/data/v1/download/1/anneal.arff", + "parquet_url": "http://minio:9000/datasets/0000/0001/dataset_1.pq", "file_id": 1, "default_target_attribute": ["class"], "version_label": "1", "tag": ["study_14"], "visibility": "public", - "status": "in_preparation", + "status": "active", "processing_date": "2024-01-04T10:13:59", "md5_checksum": "4eaed8b6ec9d8211024b6c089b064761", "row_id_attribute": [], @@ -222,7 +223,7 @@ def test_dataset_status_update_active_to_deactivated(dataset_id: int, py_api: Te def test_dataset_status_update_in_preparation_to_active(py_api: TestClient) -> None: _assert_status_update_is_successful( apikey=ApiKey.ADMIN, - dataset_id=1, + dataset_id=next(iter(constants.IN_PREPARATION_ID)), status=DatasetStatus.ACTIVE, py_api=py_api, ) @@ -232,7 +233,7 @@ def test_dataset_status_update_in_preparation_to_active(py_api: TestClient) -> N def test_dataset_status_update_in_preparation_to_deactivated(py_api: TestClient) -> None: _assert_status_update_is_successful( apikey=ApiKey.ADMIN, - dataset_id=1, + dataset_id=next(iter(constants.IN_PREPARATION_ID)), status=DatasetStatus.DEACTIVATED, py_api=py_api, ) @@ -242,7 +243,7 @@ def 
test_dataset_status_update_in_preparation_to_deactivated(py_api: TestClient) def test_dataset_status_update_deactivated_to_active(py_api: TestClient) -> None: _assert_status_update_is_successful( apikey=ApiKey.ADMIN, - dataset_id=131, + dataset_id=next(iter(constants.DEACTIVATED_DATASETS)), status=DatasetStatus.ACTIVE, py_api=py_api, ) diff --git a/tests/routers/openml/flows_test.py b/tests/routers/openml/flows_test.py index 2bf9fc3..611e478 100644 --- a/tests/routers/openml/flows_test.py +++ b/tests/routers/openml/flows_test.py @@ -221,148 +221,141 @@ def test_get_flow_with_subflow(py_api: TestClient) -> None: ], "subflows": [ { - "identifier": None, - "flow": { - "id": 4, - "uploader": 16, - "name": "weka.J48", - "class_name": "weka.classifiers.trees.J48", - "version": 1, - "external_version": "Weka_3.9.0_11194", - "description": ( - "Ross Quinlan (1993). C4.5: Programs for Machine Learning. " - "Morgan Kaufmann Publishers, San Mateo, CA." - ), - "upload_date": "2017-03-24T14:26:40", - "language": "English", - "dependencies": "Weka_3.9.0", - "parameter": [ - { - "name": "-do-not-check-capabilities", - "data_type": "flag", - "default_value": None, - "description": ( - "If set, classifier capabilities are not checked" - " before classifier is built\n\t(use with caution)." 
- ), - }, - { - "name": "-doNotMakeSplitPointActualValue", - "data_type": "flag", - "default_value": None, - "description": "Do not make split point actual value.", - }, - { - "name": "A", - "data_type": "flag", - "default_value": None, - "description": "Laplace smoothing for predicted probabilities.", - }, - { - "name": "B", - "data_type": "flag", - "default_value": None, - "description": "Use binary splits only.", - }, - { - "name": "C", - "data_type": "option", - "default_value": 0.25, - "description": ( - "Set confidence threshold for pruning.\n\t(default 0.25)" - ), - }, - { - "name": "J", - "data_type": "flag", - "default_value": None, - "description": ( - "Do not use MDL correction for info gain on numeric attributes." - ), - }, - { - "name": "L", - "data_type": "flag", - "default_value": None, - "description": "Do not clean up after the tree has been built.", - }, - { - "name": "M", - "data_type": "option", - "default_value": 2, - "description": ( - "Set minimum number of instances per leaf.\n\t(default 2)" - ), - }, - { - "name": "N", - "data_type": "option", - "default_value": None, - "description": ( - "Set number of folds for reduced error\n\t" - "pruning. 
One fold is used as pruning set.\n\t(default 3)" - ), - }, - { - "name": "O", - "data_type": "flag", - "default_value": None, - "description": "Do not collapse tree.", - }, - { - "name": "Q", - "data_type": "option", - "default_value": None, - "description": "Seed for random data shuffling (default 1).", - }, - { - "name": "R", - "data_type": "flag", - "default_value": None, - "description": "Use reduced error pruning.", - }, - { - "name": "S", - "data_type": "flag", - "default_value": None, - "description": "Do not perform subtree raising.", - }, - { - "name": "U", - "data_type": "flag", - "default_value": None, - "description": "Use unpruned tree.", - }, - { - "name": "batch-size", - "data_type": "option", - "default_value": None, - "description": ( - "The desired batch size for batch prediction (default 100)." - ), - }, - { - "name": "num-decimal-places", - "data_type": "option", - "default_value": None, - "description": ( - "The number of decimal places for the output of numbers" - " in the model (default 2)." - ), - }, - { - "name": "output-debug-info", - "data_type": "flag", - "default_value": None, - "description": ( - "If set, classifier is run in debug mode and\n\t" - "may output additional info to the console" - ), - }, - ], - "tag": ["OpenmlWeka", "weka"], - "subflows": [], - }, + "id": 4, + "uploader": 16, + "name": "weka.J48", + "class_name": "weka.classifiers.trees.J48", + "version": 1, + "external_version": "Weka_3.9.0_11194", + "description": ( + "Ross Quinlan (1993). C4.5: Programs for Machine Learning. " + "Morgan Kaufmann Publishers, San Mateo, CA." + ), + "upload_date": "2017-03-24T14:26:40", + "language": "English", + "dependencies": "Weka_3.9.0", + "parameter": [ + { + "name": "-do-not-check-capabilities", + "data_type": "flag", + "default_value": None, + "description": ( + "If set, classifier capabilities are not checked" + " before classifier is built\n\t(use with caution)." 
+ ), + }, + { + "name": "-doNotMakeSplitPointActualValue", + "data_type": "flag", + "default_value": None, + "description": "Do not make split point actual value.", + }, + { + "name": "A", + "data_type": "flag", + "default_value": None, + "description": "Laplace smoothing for predicted probabilities.", + }, + { + "name": "B", + "data_type": "flag", + "default_value": None, + "description": "Use binary splits only.", + }, + { + "name": "C", + "data_type": "option", + "default_value": 0.25, + "description": ("Set confidence threshold for pruning.\n\t(default 0.25)"), + }, + { + "name": "J", + "data_type": "flag", + "default_value": None, + "description": ( + "Do not use MDL correction for info gain on numeric attributes." + ), + }, + { + "name": "L", + "data_type": "flag", + "default_value": None, + "description": "Do not clean up after the tree has been built.", + }, + { + "name": "M", + "data_type": "option", + "default_value": 2, + "description": ("Set minimum number of instances per leaf.\n\t(default 2)"), + }, + { + "name": "N", + "data_type": "option", + "default_value": None, + "description": ( + "Set number of folds for reduced error\n\t" + "pruning. One fold is used as pruning set.\n\t(default 3)" + ), + }, + { + "name": "O", + "data_type": "flag", + "default_value": None, + "description": "Do not collapse tree.", + }, + { + "name": "Q", + "data_type": "option", + "default_value": None, + "description": "Seed for random data shuffling (default 1).", + }, + { + "name": "R", + "data_type": "flag", + "default_value": None, + "description": "Use reduced error pruning.", + }, + { + "name": "S", + "data_type": "flag", + "default_value": None, + "description": "Do not perform subtree raising.", + }, + { + "name": "U", + "data_type": "flag", + "default_value": None, + "description": "Use unpruned tree.", + }, + { + "name": "batch-size", + "data_type": "option", + "default_value": None, + "description": ( + "The desired batch size for batch prediction (default 100)." 
+ ), + }, + { + "name": "num-decimal-places", + "data_type": "option", + "default_value": None, + "description": ( + "The number of decimal places for the output of numbers" + " in the model (default 2)." + ), + }, + { + "name": "output-debug-info", + "data_type": "flag", + "default_value": None, + "description": ( + "If set, classifier is run in debug mode and\n\t" + "may output additional info to the console" + ), + }, + ], + "tag": ["OpenmlWeka", "weka"], + "subflows": [], }, ], "tag": ["OpenmlWeka", "weka"], diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py index 3faca11..812bde7 100644 --- a/tests/routers/openml/migration/datasets_migration_test.py +++ b/tests/routers/openml/migration/datasets_migration_test.py @@ -1,11 +1,11 @@ import json from http import HTTPStatus -import constants import httpx import pytest from starlette.testclient import TestClient +import tests.constants from core.conversions import nested_remove_single_element_list from tests.users import ApiKey @@ -43,6 +43,9 @@ def test_dataset_response_is_identical( # noqa: C901, PLR0912 # The new API has normalized `format` field: original_json["format"] = original_json["format"].lower() + # Pydantic HttpURL serialization omits port 80 for HTTP urls. + original_json["url"] = original_json["url"].replace(":80", "") + # There is odd behavior in the live server that I don't want to recreate: # when the creator is a list of csv names, it can either be a str or a list # depending on whether the names are quoted. 
E.g.: @@ -127,7 +130,7 @@ def test_private_dataset_owner_access( php_api: TestClient, api_key: str, ) -> None: - [private_dataset] = constants.PRIVATE_DATASET_ID + [private_dataset] = tests.constants.PRIVATE_DATASET_ID new_response = py_api.get(f"/datasets/{private_dataset}?api_key={api_key}") old_response = php_api.get(f"/data/{private_dataset}?api_key={api_key}") assert old_response.status_code == HTTPStatus.OK diff --git a/tests/routers/openml/migration/flows_migration_test.py b/tests/routers/openml/migration/flows_migration_test.py index 674bc43..14d8088 100644 --- a/tests/routers/openml/migration/flows_migration_test.py +++ b/tests/routers/openml/migration/flows_migration_test.py @@ -65,9 +65,7 @@ def convert_flow_naming_and_defaults(flow: dict[str, Any]) -> dict[str, Any]: if parameter["default_value"] is None: parameter["default_value"] = [] for subflow in flow["subflows"]: - subflow["flow"] = convert_flow_naming_and_defaults(subflow["flow"]) - if subflow["identifier"] is None: - subflow["identifier"] = [] + convert_flow_naming_and_defaults(subflow) flow["component"] = flow.pop("subflows") if flow["component"] == []: flow.pop("component") @@ -77,6 +75,8 @@ def convert_flow_naming_and_defaults(flow: dict[str, Any]) -> dict[str, Any]: new = nested_remove_single_element_list(new) expected = php_api.get(f"/flow/{flow_id}").json()["flow"] + if subflow := expected.get("component"): + expected["component"] = subflow["flow"] # The reason we don't transform "new" to str is that it becomes harder to ignore numeric type # differences (e.g., '1.0' vs '1') expected = nested_str_to_num(expected) diff --git a/tests/routers/openml/task_test.py b/tests/routers/openml/task_test.py index 89fc316..d635fdf 100644 --- a/tests/routers/openml/task_test.py +++ b/tests/routers/openml/task_test.py @@ -19,7 +19,7 @@ def test_get_task(py_api: TestClient) -> None: "estimation_procedure": { "id": 5, "type": "holdout", - "data_splits_url": 
"https://test.openml.org/api_splits/get/59/Task_59_splits.arff", + "data_splits_url": "http://php-api:80/api_splits/get/59/Task_59_splits.arff", "parameter": [ {"name": "number_repeats", "value": 1}, {"name": "number_folds", "value": None},