From 9a3dadd89da13adc4d71e87bed3a5a77b7680234 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 9 Dec 2025 11:22:11 +0100
Subject: [PATCH 1/4] Exclude DS_Store files
---
.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/.gitignore b/.gitignore
index 75dd10d..5a1ad1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
docker/mysql/data
+.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
From 4f323eff90e2d2fe2d6f5dc94f899e815f55f553 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 10 Dec 2025 10:18:11 +0100
Subject: [PATCH 2/4] Ignore return values explicitly
---
src/main.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/main.py b/src/main.py
index 85e2ed2..d8e61b3 100644
--- a/src/main.py
+++ b/src/main.py
@@ -21,18 +21,18 @@ def _parse_args() -> argparse.Namespace:
"uvicorn",
"arguments forwarded to uvicorn",
)
- uvicorn_options.add_argument(
+ _ = uvicorn_options.add_argument(
"--reload",
action="store_true",
help="Enable auto-reload",
)
- uvicorn_options.add_argument(
+ _ = uvicorn_options.add_argument(
"--host",
default="127.0.0.1",
type=str,
help="Bind socket to this host.",
)
- uvicorn_options.add_argument(
+ _ = uvicorn_options.add_argument(
"--port",
default=8000,
type=int,
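
The `_ = ...` assignments above satisfy linters that flag discarded return values (for example pyright's `reportUnusedCallResult`); `add_argument` returns the `Action` it registers, which this code never needs. A minimal standalone sketch of the pattern (not taken from this repository):

```python
import argparse

parser = argparse.ArgumentParser()
# add_argument returns the Action it creates; binding it to `_` marks the
# result as intentionally ignored, so strict linters stay quiet.
_ = parser.add_argument("--port", default=8000, type=int, help="Bind socket to this port.")

args = parser.parse_args(["--port", "8080"])
print(args.port)  # 8080
```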
From 7c0d5c3b642efd8b8acfdaa1ab830db77a993fe9 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 10 Dec 2025 10:42:51 +0100
Subject: [PATCH 3/4] Remove old PHP docker
---
docker/mysql/README.md | 2 +-
docker/readme.md | 27 +--------------------------
2 files changed, 2 insertions(+), 27 deletions(-)
diff --git a/docker/mysql/README.md b/docker/mysql/README.md
index edb3b48..95ead55 100644
--- a/docker/mysql/README.md
+++ b/docker/mysql/README.md
@@ -16,7 +16,7 @@ which sets:
You should be able to connect to it using `mysql`:
```bash
-
+mysql --host 127.0.0.1 --port 3306 -uroot -pok
```
If you do not have `mysql` installed, you may refer to the MySQL image documentation on
how to use the image instead to connect over a docker network if you want to connect
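
For readers without a local `mysql` client, a hedged sketch of the docker-network route mentioned above (the network name is an assumption; check `docker network ls` for the actual name):

```bash
# Run a throwaway mysql client container on the compose network and connect
# to the database service by its container name.
docker run -it --rm --network server-api_default mysql \
    mysql --host openml-test-database --port 3306 -uroot -pok
```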
diff --git a/docker/readme.md b/docker/readme.md
index 8fc041f..f31c281 100644
--- a/docker/readme.md
+++ b/docker/readme.md
@@ -7,39 +7,14 @@ This directory contains the files and information to build the following 5 image
- docs: the official [mkdocs-material](https://hub.docker.com/r/squidfunk/mkdocs-material)
image, but with additional plugins installed that are required for building the documentation
in this project's `/doc` directory.
- - [openml/php-rest-api](https://hub.docker.com/r/openml/php-rest-api): image with the
- php back-end code, but ran on [feature/elasticsearch8](https://github.com/openml/openml/tree/feature/elasticsearch8)
- branch.
- python-api: an image of this project, to facilitate development on any platform.
- - [openml/elasticsearch8-prebuilt](https://hub.docker.com/r/openml/elasticsearch8-prebuilt):
- the default elasticsearch image, but with indices already built on the test database
- through invocation of the old php code.
Between the prebuilt indices and the baked-in database, when all images have already been
pulled, a `docker compose up` step should only take seconds. 🚀
-## Building `openml/elasticsearch8-prebuilt`
-The `openml/elasticsearch8-prebuilt` is not made with a Dockerfile, because it requires
-steps of running containers, which to the best of my knowledge is not facilitated by
-docker (not even through [multi-stage builds](https://docs.docker.com/build/building/multi-stage/)).
-So, instead we build the container state locally and then use [`docker commit`](https://docs.docker.com/engine/reference/commandline/commit/).
-
-1. run `docker compose up`, but with the `elasticsearch` service pointing to
- `docker.elastic.co/elasticsearch/elasticsearch:8.10.4` instead of `openml/elasticsearch8-prebuilt`.
-2. build the indices from the `php-api` container:
-
- 1. Connect to the container: `docker exec -it server-api-php-api-1 /bin/bash`
- 2. (optional) Edit `/var/www/openml/index.php` and set L56 to `development` instead of `production`,
- this will show progress of building the indices, or print out any error that may occur.
- 3. Build the indices: `php /var/www/openml/index.php cron build_es_indices`
- 4. Exit the container with `exit`.
-
-3. Make a commit of the elastic search container with prebuilt indices: `docker commit elasticsearch openml/elasticsearch8-prebuilt`
-4. Push the image created by the commit: `docker push openml/elasticsearch8-prebuilt`
-
## Building for multiple platforms
-Following Docker's "[multi-platform images](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwiTutyczsOCAxUUhv0HHe_VA6QQFnoECBAQAQ&url=https%3A%2F%2Fdocs.docker.com%2Fbuild%2Fbuilding%2Fmulti-platform%2F&usg=AOvVaw0YP_mkj5WTYD-0weEfrfDv&opi=89978449)"
+Following Docker's "[multi-platform images](https://docs.docker.com/build/building/multi-platform/)"
documentation, we can build multi-platform images in a few simple steps:
1. Only the first time, create a docker-container driver: `docker buildx create --name container --driver=docker-container`
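
Once the `container` builder exists, the remaining multi-platform steps boil down to a single `buildx` invocation. A sketch with an illustrative tag and platform list (not a command from this README):

```bash
# Build the python-api image for two platforms with the docker-container
# builder created in step 1, and push the multi-platform manifest.
docker buildx build --builder container \
    --platform linux/amd64,linux/arm64 \
    --file docker/python/Dockerfile \
    --tag openml/python-api:latest \
    --push .
```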
From 813927afb347e06756189adf4786dbdc5bd73d6e Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Fri, 19 Dec 2025 09:33:20 +0100
Subject: [PATCH 4/4] Make tests pass with openml-services (#217)
For now, we still maintain the relevant definition files in this repository so that they can change independently for a little while, while the server is under the most active development. Once things settle, we can consider which changes should be merged back into openml-services to reduce duplication again.
---
.github/workflows/tests.yml | 30 +-
docker-compose.yaml | 71 +++--
docker/database/update.sh | 31 ++
docker/elasticsearch/.env | 3 +
docker/php/.env | 14 +
src/config.py | 14 +-
src/config.toml | 4 +
src/core/formatting.py | 13 +-
src/routers/openml/flows.py | 10 +-
src/routers/openml/tasks.py | 4 +-
src/schemas/datasets/mldcat_ap.py | 2 +-
src/schemas/flows.py | 9 +-
tests/conftest.py | 6 +-
tests/constants.py | 4 +-
tests/routers/openml/datasets_test.py | 13 +-
tests/routers/openml/flows_test.py | 277 +++++++++---------
.../migration/datasets_migration_test.py | 7 +-
.../openml/migration/flows_migration_test.py | 6 +-
tests/routers/openml/task_test.py | 2 +-
19 files changed, 298 insertions(+), 222 deletions(-)
create mode 100755 docker/database/update.sh
create mode 100644 docker/elasticsearch/.env
create mode 100644 docker/php/.env
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 6b5e31b..cff3449 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -15,21 +15,15 @@ jobs:
compare-php:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
- - uses: actions/setup-python@v4
+ - uses: actions/checkout@v6
+ - uses: actions/setup-python@v6
with:
python-version: 3.x
- # A naive `docker compose up` would first build the `python-api` container and then
- # start all services, which kickstarts Elastic Search and building indices.
- # But since those two steps are independent, we can parallelize them to save time.
- - run: |
- docker compose build python-api
- docker compose up -d --wait python-api php-api
- - run: docker container ls && docker image ls
- - run: docker exec python-api python -m pip freeze
- - run: docker exec python-api coverage run -m pytest -xv -m "php_api"
- - run: docker exec python-api coverage xml
+ # https://github.com/docker/compose/issues/10596
+ - run: docker compose --profile "python" --profile "php" up --detach --wait --remove-orphans || exit $(docker compose ps -q | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l)
+ - run: docker exec openml-python-rest-api coverage run -m pytest -v -m "php_api"
+ - run: docker exec openml-python-rest-api coverage xml
- name: Upload results to Codecov
uses: codecov/codecov-action@v4
with:
@@ -37,15 +31,13 @@ jobs:
python:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
- - uses: actions/setup-python@v4
+ - uses: actions/checkout@v6
+ - uses: actions/setup-python@v6
with:
python-version: 3.x
- - run: docker compose up -d --wait database python-api
- - run: docker container ls && docker image ls
- - run: docker exec python-api python -m pip freeze
- - run: docker exec python-api coverage run -m pytest -xv -m "not php_api"
- - run: docker exec python-api coverage xml
+ - run: docker compose --profile "python" up --detach --wait --remove-orphans || exit $(docker compose ps -q | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l)
+ - run: docker exec openml-python-rest-api coverage run -m pytest -v -m "not php_api"
+ - run: docker exec openml-python-rest-api coverage xml
- name: Upload results to Codecov
uses: codecov/codecov-action@v4
with:
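
The `|| exit $(...)` fallback in both jobs works around docker/compose#10596: `up --wait` returns a non-zero code when a one-shot service (here `database-setup`) exits after finishing its work. Expanded for readability, the same logic reads roughly as:

```bash
# Fail only when some container in the project actually exited with a
# non-zero code; a cleanly finished one-shot service keeps the count at 0.
docker compose --profile "python" up --detach --wait --remove-orphans || {
    failed=$(docker compose ps -q | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l)
    exit "$failed"
}
```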
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8b0ef56..324350f 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,13 +1,33 @@
services:
database:
- image: "openml/test-database"
+ profiles: ["python", "php", "all"]
+ image: "openml/test-database:20240105"
container_name: "openml-test-database"
environment:
MYSQL_ROOT_PASSWORD: ok
ports:
- "3306:3306"
+ healthcheck:
+ test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"]
+ start_period: 30s
+ start_interval: 1s
+ timeout: 3s
+ interval: 5s
+ retries: 10
+
+ database-setup:
+ profiles: ["python", "php", "all"]
+ image: mysql
+ container_name: "openml-test-database-setup"
+ volumes:
+ - ./docker/database/update.sh:/database-update.sh
+ command: /bin/sh -c "/database-update.sh"
+ depends_on:
+ database:
+ condition: service_healthy
docs:
+ profiles: ["all"]
build:
context: .
dockerfile: docker/docs/Dockerfile
@@ -16,8 +36,35 @@ services:
volumes:
- .:/docs
+ elasticsearch:
+ profiles: ["php", "all"]
+ image: docker.elastic.co/elasticsearch/elasticsearch:6.8.23
+ container_name: "openml-elasticsearch"
+ platform: "linux/amd64"
+ ports:
+ - "9200:9200" # also known as /es (nginx)
+ - "9300:9300"
+ env_file: docker/elasticsearch/.env
+ healthcheck:
+ test: curl 127.0.0.1:9200/_cluster/health | grep -e "green"
+ start_period: 30s
+ start_interval: 5s
+ timeout: 3s
+ interval: 10s
+ deploy:
+ resources:
+ limits:
+ cpus: '1'
+ memory: 1G
+ reservations:
+ cpus: '0.2'
+ memory: 250M
+
php-api:
- image: "openml/php-rest-api"
+ profiles: ["php", "all"]
+ image: "openml/php-rest-api:v1.2.2"
+ container_name: "openml-php-rest-api"
+ env_file: docker/php/.env
ports:
- "8002:80"
depends_on:
@@ -33,7 +80,8 @@ services:
interval: 1m
python-api:
- container_name: "python-api"
+ profiles: ["python", "all"]
+ container_name: "openml-python-rest-api"
build:
context: .
dockerfile: docker/python/Dockerfile
@@ -43,20 +91,3 @@ services:
- .:/python-api
depends_on:
- database
-
- elasticsearch:
- image: docker.elastic.co/elasticsearch/elasticsearch:6.8.23
- container_name: "elasticsearch"
- ports:
- - "9200:9200"
- - "9300:9300"
- environment:
- - ELASTIC_PASSWORD=default
- - discovery.type=single-node
- - xpack.security.enabled=false
- healthcheck:
- test: curl 127.0.0.1:9200/_cluster/health | grep -e "green"
- start_period: 30s
- start_interval: 5s
- timeout: 3s
- interval: 1m
diff --git a/docker/database/update.sh b/docker/database/update.sh
new file mode 100755
index 0000000..7c87ca8
--- /dev/null
+++ b/docker/database/update.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Change the filepath of openml.file
+# from "https://www.openml.org/data/download/1666876/phpFsFYVN"
+# to "http://minio:9000/datasets/0000/0001/phpFsFYVN"
+mysql -hdatabase -uroot -pok -e 'UPDATE openml.file SET filepath = CONCAT("http://minio:9000/datasets/0000/", LPAD(id, 4, "0"), "/", SUBSTRING_INDEX(filepath, "/", -1)) WHERE extension="arff";'
+
+# Update openml_expdb.dataset with the same url
+mysql -hdatabase -uroot -pok -e 'UPDATE openml_expdb.dataset DS, openml.file FL SET DS.url = FL.filepath WHERE DS.did = FL.id;'
+
+
+
+
+
+# Create the data_feature_description TABLE. TODO: can we make sure this table exists already?
+mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `data_feature_description` (
+ `did` int unsigned NOT NULL,
+ `index` int unsigned NOT NULL,
+ `uploader` mediumint unsigned NOT NULL,
+ `date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ `description_type` enum("plain", "ontology") NOT NULL,
+ `value` varchar(256) NOT NULL,
+ KEY `did` (`did`,`index`),
+ CONSTRAINT `data_feature_description_ibfk_1` FOREIGN KEY (`did`, `index`) REFERENCES `data_feature` (`did`, `index`) ON DELETE CASCADE ON UPDATE CASCADE
+)'
+
+# Set dataset 1 to active (used in the Java unit tests)
+mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'INSERT IGNORE INTO dataset_status VALUES (1, "active", "2024-01-01 00:00:00", 1)'
+mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'DELETE FROM dataset_status WHERE did = 2 AND status = "deactivated";'
+
+# Temporary fix in case the database is missing the kaggle table. The PHP REST API expects the table to exist while indexing.
+mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `kaggle` (`dataset_id` int(11) DEFAULT NULL, `kaggle_link` varchar(500) DEFAULT NULL)'
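
Compose runs this script once through the `database-setup` service defined above; to re-apply it by hand against an already-running database, something like the following should work (explicitly naming a service on the command line also activates its profile):

```bash
# One-shot re-run of the setup container; it waits for the database
# healthcheck via depends_on before executing update.sh.
docker compose run --rm database-setup
```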
diff --git a/docker/elasticsearch/.env b/docker/elasticsearch/.env
new file mode 100644
index 0000000..6e19eaa
--- /dev/null
+++ b/docker/elasticsearch/.env
@@ -0,0 +1,3 @@
+ELASTIC_PASSWORD=default
+discovery.type=single-node
+xpack.security.enabled=false
diff --git a/docker/php/.env b/docker/php/.env
new file mode 100644
index 0000000..ad0bb55
--- /dev/null
+++ b/docker/php/.env
@@ -0,0 +1,14 @@
+API_KEY=AD000000000000000000000000000000
+BASE_URL=http://php-api:80/
+MINIO_URL=http://minio:9000/
+DB_HOST_OPENML=database:3306
+DB_HOST_EXPDB=database:3306
+DB_USER_OPENML=root
+DB_PASS_OPENML=ok
+DB_USER_EXPDB_WRITE=root
+DB_PASS_EXPDB_WRITE=ok
+DB_USER_EXPDB_READ=root
+DB_PASS_EXPDB_READ=ok
+ES_URL=elasticsearch:9200
+ES_PASSWORD=default
+INDEX_ES_DURING_STARTUP=false
diff --git a/src/config.py b/src/config.py
index 4c97e4f..8a19f04 100644
--- a/src/config.py
+++ b/src/config.py
@@ -8,6 +8,8 @@
TomlTable = dict[str, typing.Any]
+CONFIG_PATH = Path(__file__).parent / "config.toml"
+
def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable:
defaults = configuration["defaults"]
@@ -19,9 +21,17 @@ def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable:
@functools.cache
-def load_database_configuration(file: Path = Path(__file__).parent / "config.toml") -> TomlTable:
- configuration = tomllib.loads(file.read_text())
+def _load_configuration(file: Path) -> TomlTable:
+ return typing.cast(TomlTable, tomllib.loads(file.read_text()))
+
+
+def load_routing_configuration(file: Path = CONFIG_PATH) -> TomlTable:
+ return typing.cast(TomlTable, _load_configuration(file)["routing"])
+
+@functools.cache
+def load_database_configuration(file: Path = CONFIG_PATH) -> TomlTable:
+ configuration = _load_configuration(file)
database_configuration = _apply_defaults_to_siblings(
configuration["databases"],
)
diff --git a/src/config.toml b/src/config.toml
index 732d7ad..0812add 100644
--- a/src/config.toml
+++ b/src/config.toml
@@ -15,3 +15,7 @@ database="openml_expdb"
[databases.openml]
database="openml"
+
+[routing]
+minio_url="http://minio:9000/"
+server_url="http://php-api:80/"
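
As a quick illustration of how the new `[routing]` table is consumed (values mirror the `config.toml` above; this snippet is not part of the patch):

```python
from config import load_routing_configuration

routing = load_routing_configuration()  # reads the [routing] table from src/config.toml
print(routing["minio_url"])   # http://minio:9000/
print(routing["server_url"])  # http://php-api:80/
```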
diff --git a/src/core/formatting.py b/src/core/formatting.py
index 83e3f89..174261f 100644
--- a/src/core/formatting.py
+++ b/src/core/formatting.py
@@ -2,7 +2,7 @@
from sqlalchemy.engine import Row
-from config import load_configuration
+from config import load_routing_configuration
from core.errors import DatasetError
from schemas.datasets.openml import DatasetFileFormat
@@ -25,15 +25,16 @@ def _format_parquet_url(dataset: Row) -> str | None:
if dataset.format.lower() != DatasetFileFormat.ARFF:
return None
- minio_base_url = load_configuration()["minio_base_url"]
- prefix = dataset.did // 10_000
- return f"{minio_base_url}/datasets/{prefix:04d}/{dataset.did:04d}/dataset_{dataset.did}.pq"
+ minio_base_url = load_routing_configuration()["minio_url"]
+ ten_thousands_prefix = f"{dataset.did // 10_000:04d}"
+ padded_id = f"{dataset.did:04d}"
+ return f"{minio_base_url}datasets/{ten_thousands_prefix}/{padded_id}/dataset_{dataset.did}.pq"
def _format_dataset_url(dataset: Row) -> str:
- base_url = load_configuration()["arff_base_url"]
+ base_url = load_routing_configuration()["server_url"]
filename = f"{html.escape(dataset.name)}.{dataset.format.lower()}"
- return f"{base_url}/data/v1/download/{dataset.file_id}/{filename}"
+ return f"{base_url}data/v1/download/{dataset.file_id}/{filename}"
def _safe_unquote(text: str | None) -> str | None:
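
For reference, the new URL layout worked through for dataset id 1, using the `minio_url` from `config.toml` (matching the expectation later in `datasets_test.py`):

```python
# Illustrative recomputation of the parquet URL produced by _format_parquet_url.
did = 1
minio_base_url = "http://minio:9000/"
ten_thousands_prefix = f"{did // 10_000:04d}"  # "0000": one folder per 10,000 datasets
padded_id = f"{did:04d}"                       # "0001"
print(f"{minio_base_url}datasets/{ten_thousands_prefix}/{padded_id}/dataset_{did}.pq")
# http://minio:9000/datasets/0000/0001/dataset_1.pq
```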
diff --git a/src/routers/openml/flows.py b/src/routers/openml/flows.py
index 4eae983..083916b 100644
--- a/src/routers/openml/flows.py
+++ b/src/routers/openml/flows.py
@@ -49,14 +49,8 @@ def get_flow(flow_id: int, expdb: Annotated[Connection, Depends(expdb_connection
]
tags = database.flows.get_tags(flow_id, expdb)
- flow_rows = database.flows.get_subflows(for_flow=flow_id, expdb=expdb)
- subflows = [
- {
- "identifier": flow.identifier,
- "flow": get_flow(flow_id=flow.child_id, expdb=expdb),
- }
- for flow in flow_rows
- ]
+ flow_rows = database.flows.get_subflows(flow_id, expdb)
+ subflows = [get_flow(flow_id=flow.child_id, expdb=expdb) for flow in flow_rows]
return Flow(
id_=flow.id,
diff --git a/src/routers/openml/tasks.py b/src/routers/openml/tasks.py
index 4fcb362..96d0198 100644
--- a/src/routers/openml/tasks.py
+++ b/src/routers/openml/tasks.py
@@ -7,6 +7,7 @@
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import Connection, RowMapping, text
+import config
import database.datasets
import database.tasks
from routers.dependencies import expdb_connection
@@ -139,7 +140,8 @@ def _fill_json_template(
# I believe that the operations below are always part of string output, so
# we don't need to be careful to avoid losing typedness
template = template.replace("[TASK:id]", str(task.task_id))
- return template.replace("[CONSTANT:base_url]", "https://test.openml.org/")
+ server_url = config.load_routing_configuration()["server_url"]
+ return template.replace("[CONSTANT:base_url]", server_url)
@router.get("/{task_id}")
diff --git a/src/schemas/datasets/mldcat_ap.py b/src/schemas/datasets/mldcat_ap.py
index 9525431..d7e277f 100644
--- a/src/schemas/datasets/mldcat_ap.py
+++ b/src/schemas/datasets/mldcat_ap.py
@@ -275,7 +275,7 @@ class DataService(JsonLDObject):
class JsonLDGraph(BaseModel):
- context: str | dict[str, HttpUrl] = Field(default_factory=dict, serialization_alias="@context")
+ context: str | dict[str, HttpUrl] = Field(default_factory=dict, serialization_alias="@context") # type: ignore[arg-type]
graph: list[Distribution | DataService | Dataset | Quality | Feature | Agent | MD5Checksum] = (
Field(default_factory=list, serialization_alias="@graph")
)
diff --git a/src/schemas/flows.py b/src/schemas/flows.py
index a6cd479..33dc081 100644
--- a/src/schemas/flows.py
+++ b/src/schemas/flows.py
@@ -1,7 +1,7 @@
from __future__ import annotations
from datetime import datetime
-from typing import Any, TypedDict
+from typing import Any, Self
from pydantic import BaseModel, ConfigDict, Field
@@ -25,12 +25,7 @@ class Flow(BaseModel):
language: str | None = Field(max_length=128)
dependencies: str | None
parameter: list[Parameter]
- subflows: list[Subflow]
+ subflows: list[Self]
tag: list[str]
model_config = ConfigDict(arbitrary_types_allowed=True)
-
-
-class Subflow(TypedDict):
- identifier: str | None
- flow: Flow
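
Dropping the `Subflow` wrapper makes `Flow` a self-referencing model. A minimal standalone sketch of a recursive Pydantic model (using a plain forward reference rather than the `typing.Self` annotation the patch uses):

```python
from __future__ import annotations

from pydantic import BaseModel


class Node(BaseModel):
    name: str
    children: list[Node] = []


tree = Node(name="weka.AdaBoostM1_J48", children=[Node(name="weka.J48")])
print(tree.model_dump())
# {'name': 'weka.AdaBoostM1_J48', 'children': [{'name': 'weka.J48', 'children': []}]}
```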
diff --git a/tests/conftest.py b/tests/conftest.py
index 4d2c2c9..eecc128 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -16,6 +16,8 @@
from main import create_api
from routers.dependencies import expdb_connection, userdb_connection
+PHP_API_URL = "http://openml-php-rest-api:80/api/v1/json"
+
@contextlib.contextmanager
def automatic_rollback(engine: Engine) -> Iterator[Connection]:
@@ -39,8 +41,8 @@ def user_test() -> Connection:
@pytest.fixture
-def php_api() -> Iterator[httpx.Client]:
- with httpx.Client(base_url="http://server-api-php-api-1:80/api/v1/json") as client:
+def php_api() -> httpx.Client:
+ with httpx.Client(base_url=PHP_API_URL) as client:
yield client
diff --git a/tests/constants.py b/tests/constants.py
index e471fd5..6881f88 100644
--- a/tests/constants.py
+++ b/tests/constants.py
@@ -1,6 +1,6 @@
PRIVATE_DATASET_ID = {130}
-IN_PREPARATION_ID = {1, 33}
-DEACTIVATED_DATASETS = {2, 131}
+IN_PREPARATION_ID = {33}
+DEACTIVATED_DATASETS = {131}
DATASETS = set(range(1, 132))
NUMBER_OF_DATASETS = len(DATASETS)
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 7c1457f..b463d3d 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -8,6 +8,7 @@
from database.users import User
from routers.openml.datasets import get_dataset
from schemas.datasets.openml import DatasetMetadata, DatasetStatus
+from tests import constants
from tests.users import ADMIN_USER, NO_USER, OWNER_USER, SOME_USER, ApiKey
@@ -44,14 +45,14 @@ def test_get_dataset(py_api: TestClient) -> None:
"description_version": 1,
"upload_date": "2014-04-06T23:19:24",
"licence": "Public",
- "url": "https://test.openml.org/data/v1/download/1/anneal.arff",
- "parquet_url": "https://openml1.win.tue.nl/datasets/0000/0001/dataset_1.pq",
+ "url": "http://php-api/data/v1/download/1/anneal.arff",
+ "parquet_url": "http://minio:9000/datasets/0000/0001/dataset_1.pq",
"file_id": 1,
"default_target_attribute": ["class"],
"version_label": "1",
"tag": ["study_14"],
"visibility": "public",
- "status": "in_preparation",
+ "status": "active",
"processing_date": "2024-01-04T10:13:59",
"md5_checksum": "4eaed8b6ec9d8211024b6c089b064761",
"row_id_attribute": [],
@@ -222,7 +223,7 @@ def test_dataset_status_update_active_to_deactivated(dataset_id: int, py_api: Te
def test_dataset_status_update_in_preparation_to_active(py_api: TestClient) -> None:
_assert_status_update_is_successful(
apikey=ApiKey.ADMIN,
- dataset_id=1,
+ dataset_id=next(iter(constants.IN_PREPARATION_ID)),
status=DatasetStatus.ACTIVE,
py_api=py_api,
)
@@ -232,7 +233,7 @@ def test_dataset_status_update_in_preparation_to_active(py_api: TestClient) -> N
def test_dataset_status_update_in_preparation_to_deactivated(py_api: TestClient) -> None:
_assert_status_update_is_successful(
apikey=ApiKey.ADMIN,
- dataset_id=1,
+ dataset_id=next(iter(constants.IN_PREPARATION_ID)),
status=DatasetStatus.DEACTIVATED,
py_api=py_api,
)
@@ -242,7 +243,7 @@ def test_dataset_status_update_in_preparation_to_deactivated(py_api: TestClient)
def test_dataset_status_update_deactivated_to_active(py_api: TestClient) -> None:
_assert_status_update_is_successful(
apikey=ApiKey.ADMIN,
- dataset_id=131,
+ dataset_id=next(iter(constants.DEACTIVATED_DATASETS)),
status=DatasetStatus.ACTIVE,
py_api=py_api,
)
diff --git a/tests/routers/openml/flows_test.py b/tests/routers/openml/flows_test.py
index 2bf9fc3..611e478 100644
--- a/tests/routers/openml/flows_test.py
+++ b/tests/routers/openml/flows_test.py
@@ -221,148 +221,141 @@ def test_get_flow_with_subflow(py_api: TestClient) -> None:
],
"subflows": [
{
- "identifier": None,
- "flow": {
- "id": 4,
- "uploader": 16,
- "name": "weka.J48",
- "class_name": "weka.classifiers.trees.J48",
- "version": 1,
- "external_version": "Weka_3.9.0_11194",
- "description": (
- "Ross Quinlan (1993). C4.5: Programs for Machine Learning. "
- "Morgan Kaufmann Publishers, San Mateo, CA."
- ),
- "upload_date": "2017-03-24T14:26:40",
- "language": "English",
- "dependencies": "Weka_3.9.0",
- "parameter": [
- {
- "name": "-do-not-check-capabilities",
- "data_type": "flag",
- "default_value": None,
- "description": (
- "If set, classifier capabilities are not checked"
- " before classifier is built\n\t(use with caution)."
- ),
- },
- {
- "name": "-doNotMakeSplitPointActualValue",
- "data_type": "flag",
- "default_value": None,
- "description": "Do not make split point actual value.",
- },
- {
- "name": "A",
- "data_type": "flag",
- "default_value": None,
- "description": "Laplace smoothing for predicted probabilities.",
- },
- {
- "name": "B",
- "data_type": "flag",
- "default_value": None,
- "description": "Use binary splits only.",
- },
- {
- "name": "C",
- "data_type": "option",
- "default_value": 0.25,
- "description": (
- "Set confidence threshold for pruning.\n\t(default 0.25)"
- ),
- },
- {
- "name": "J",
- "data_type": "flag",
- "default_value": None,
- "description": (
- "Do not use MDL correction for info gain on numeric attributes."
- ),
- },
- {
- "name": "L",
- "data_type": "flag",
- "default_value": None,
- "description": "Do not clean up after the tree has been built.",
- },
- {
- "name": "M",
- "data_type": "option",
- "default_value": 2,
- "description": (
- "Set minimum number of instances per leaf.\n\t(default 2)"
- ),
- },
- {
- "name": "N",
- "data_type": "option",
- "default_value": None,
- "description": (
- "Set number of folds for reduced error\n\t"
- "pruning. One fold is used as pruning set.\n\t(default 3)"
- ),
- },
- {
- "name": "O",
- "data_type": "flag",
- "default_value": None,
- "description": "Do not collapse tree.",
- },
- {
- "name": "Q",
- "data_type": "option",
- "default_value": None,
- "description": "Seed for random data shuffling (default 1).",
- },
- {
- "name": "R",
- "data_type": "flag",
- "default_value": None,
- "description": "Use reduced error pruning.",
- },
- {
- "name": "S",
- "data_type": "flag",
- "default_value": None,
- "description": "Do not perform subtree raising.",
- },
- {
- "name": "U",
- "data_type": "flag",
- "default_value": None,
- "description": "Use unpruned tree.",
- },
- {
- "name": "batch-size",
- "data_type": "option",
- "default_value": None,
- "description": (
- "The desired batch size for batch prediction (default 100)."
- ),
- },
- {
- "name": "num-decimal-places",
- "data_type": "option",
- "default_value": None,
- "description": (
- "The number of decimal places for the output of numbers"
- " in the model (default 2)."
- ),
- },
- {
- "name": "output-debug-info",
- "data_type": "flag",
- "default_value": None,
- "description": (
- "If set, classifier is run in debug mode and\n\t"
- "may output additional info to the console"
- ),
- },
- ],
- "tag": ["OpenmlWeka", "weka"],
- "subflows": [],
- },
+ "id": 4,
+ "uploader": 16,
+ "name": "weka.J48",
+ "class_name": "weka.classifiers.trees.J48",
+ "version": 1,
+ "external_version": "Weka_3.9.0_11194",
+ "description": (
+ "Ross Quinlan (1993). C4.5: Programs for Machine Learning. "
+ "Morgan Kaufmann Publishers, San Mateo, CA."
+ ),
+ "upload_date": "2017-03-24T14:26:40",
+ "language": "English",
+ "dependencies": "Weka_3.9.0",
+ "parameter": [
+ {
+ "name": "-do-not-check-capabilities",
+ "data_type": "flag",
+ "default_value": None,
+ "description": (
+ "If set, classifier capabilities are not checked"
+ " before classifier is built\n\t(use with caution)."
+ ),
+ },
+ {
+ "name": "-doNotMakeSplitPointActualValue",
+ "data_type": "flag",
+ "default_value": None,
+ "description": "Do not make split point actual value.",
+ },
+ {
+ "name": "A",
+ "data_type": "flag",
+ "default_value": None,
+ "description": "Laplace smoothing for predicted probabilities.",
+ },
+ {
+ "name": "B",
+ "data_type": "flag",
+ "default_value": None,
+ "description": "Use binary splits only.",
+ },
+ {
+ "name": "C",
+ "data_type": "option",
+ "default_value": 0.25,
+ "description": ("Set confidence threshold for pruning.\n\t(default 0.25)"),
+ },
+ {
+ "name": "J",
+ "data_type": "flag",
+ "default_value": None,
+ "description": (
+ "Do not use MDL correction for info gain on numeric attributes."
+ ),
+ },
+ {
+ "name": "L",
+ "data_type": "flag",
+ "default_value": None,
+ "description": "Do not clean up after the tree has been built.",
+ },
+ {
+ "name": "M",
+ "data_type": "option",
+ "default_value": 2,
+ "description": ("Set minimum number of instances per leaf.\n\t(default 2)"),
+ },
+ {
+ "name": "N",
+ "data_type": "option",
+ "default_value": None,
+ "description": (
+ "Set number of folds for reduced error\n\t"
+ "pruning. One fold is used as pruning set.\n\t(default 3)"
+ ),
+ },
+ {
+ "name": "O",
+ "data_type": "flag",
+ "default_value": None,
+ "description": "Do not collapse tree.",
+ },
+ {
+ "name": "Q",
+ "data_type": "option",
+ "default_value": None,
+ "description": "Seed for random data shuffling (default 1).",
+ },
+ {
+ "name": "R",
+ "data_type": "flag",
+ "default_value": None,
+ "description": "Use reduced error pruning.",
+ },
+ {
+ "name": "S",
+ "data_type": "flag",
+ "default_value": None,
+ "description": "Do not perform subtree raising.",
+ },
+ {
+ "name": "U",
+ "data_type": "flag",
+ "default_value": None,
+ "description": "Use unpruned tree.",
+ },
+ {
+ "name": "batch-size",
+ "data_type": "option",
+ "default_value": None,
+ "description": (
+ "The desired batch size for batch prediction (default 100)."
+ ),
+ },
+ {
+ "name": "num-decimal-places",
+ "data_type": "option",
+ "default_value": None,
+ "description": (
+ "The number of decimal places for the output of numbers"
+ " in the model (default 2)."
+ ),
+ },
+ {
+ "name": "output-debug-info",
+ "data_type": "flag",
+ "default_value": None,
+ "description": (
+ "If set, classifier is run in debug mode and\n\t"
+ "may output additional info to the console"
+ ),
+ },
+ ],
+ "tag": ["OpenmlWeka", "weka"],
+ "subflows": [],
},
],
"tag": ["OpenmlWeka", "weka"],
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
index 3faca11..812bde7 100644
--- a/tests/routers/openml/migration/datasets_migration_test.py
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -1,11 +1,11 @@
import json
from http import HTTPStatus
-import constants
import httpx
import pytest
from starlette.testclient import TestClient
+import tests.constants
from core.conversions import nested_remove_single_element_list
from tests.users import ApiKey
@@ -43,6 +43,9 @@ def test_dataset_response_is_identical( # noqa: C901, PLR0912
# The new API has normalized `format` field:
original_json["format"] = original_json["format"].lower()
+ # Pydantic HttpUrl serialization omits port 80 for HTTP URLs.
+ original_json["url"] = original_json["url"].replace(":80", "")
+
# There is odd behavior in the live server that I don't want to recreate:
# when the creator is a list of csv names, it can either be a str or a list
# depending on whether the names are quoted. E.g.:
@@ -127,7 +130,7 @@ def test_private_dataset_owner_access(
php_api: TestClient,
api_key: str,
) -> None:
- [private_dataset] = constants.PRIVATE_DATASET_ID
+ [private_dataset] = tests.constants.PRIVATE_DATASET_ID
new_response = py_api.get(f"/datasets/{private_dataset}?api_key={api_key}")
old_response = php_api.get(f"/data/{private_dataset}?api_key={api_key}")
assert old_response.status_code == HTTPStatus.OK
diff --git a/tests/routers/openml/migration/flows_migration_test.py b/tests/routers/openml/migration/flows_migration_test.py
index 674bc43..14d8088 100644
--- a/tests/routers/openml/migration/flows_migration_test.py
+++ b/tests/routers/openml/migration/flows_migration_test.py
@@ -65,9 +65,7 @@ def convert_flow_naming_and_defaults(flow: dict[str, Any]) -> dict[str, Any]:
if parameter["default_value"] is None:
parameter["default_value"] = []
for subflow in flow["subflows"]:
- subflow["flow"] = convert_flow_naming_and_defaults(subflow["flow"])
- if subflow["identifier"] is None:
- subflow["identifier"] = []
+ convert_flow_naming_and_defaults(subflow)
flow["component"] = flow.pop("subflows")
if flow["component"] == []:
flow.pop("component")
@@ -77,6 +75,8 @@ def convert_flow_naming_and_defaults(flow: dict[str, Any]) -> dict[str, Any]:
new = nested_remove_single_element_list(new)
expected = php_api.get(f"/flow/{flow_id}").json()["flow"]
+ if subflow := expected.get("component"):
+ expected["component"] = subflow["flow"]
# The reason we don't transform "new" to str is that it becomes harder to ignore numeric type
# differences (e.g., '1.0' vs '1')
expected = nested_str_to_num(expected)
diff --git a/tests/routers/openml/task_test.py b/tests/routers/openml/task_test.py
index 89fc316..d635fdf 100644
--- a/tests/routers/openml/task_test.py
+++ b/tests/routers/openml/task_test.py
@@ -19,7 +19,7 @@ def test_get_task(py_api: TestClient) -> None:
"estimation_procedure": {
"id": 5,
"type": "holdout",
- "data_splits_url": "https://test.openml.org/api_splits/get/59/Task_59_splits.arff",
+ "data_splits_url": "http://php-api:80/api_splits/get/59/Task_59_splits.arff",
"parameter": [
{"name": "number_repeats", "value": 1},
{"name": "number_folds", "value": None},