30 changes: 11 additions & 19 deletions .github/workflows/tests.yml
@@ -15,37 +15,29 @@ jobs:
compare-php:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/checkout@v6
- uses: actions/setup-python@v6
with:
python-version: 3.x

# A naive `docker compose up` would first build the `python-api` container and then
# start all services, which kickstarts Elastic Search and building indices.
# But since those two steps are independent, we can parallelize them to save time.
- run: |
docker compose build python-api
docker compose up -d --wait python-api php-api
- run: docker container ls && docker image ls
- run: docker exec python-api python -m pip freeze
- run: docker exec python-api coverage run -m pytest -xv -m "php_api"
- run: docker exec python-api coverage xml
# https://github.com/docker/compose/issues/10596
- run: docker compose --profile "python" --profile "php" up --detach --wait --remove-orphans || exit $(docker compose ps -q | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l)
- run: docker exec openml-python-rest-api coverage run -m pytest -v -m "php_api"
- run: docker exec openml-python-rest-api coverage xml
- name: Upload results to Codecov
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
python:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/checkout@v6
- uses: actions/setup-python@v6
with:
python-version: 3.x
- run: docker compose up -d --wait database python-api
- run: docker container ls && docker image ls
- run: docker exec python-api python -m pip freeze
- run: docker exec python-api coverage run -m pytest -xv -m "not php_api"
- run: docker exec python-api coverage xml
- run: docker compose --profile "python" up --detach --wait --remove-orphans || exit $(docker compose ps -q | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l)
- run: docker exec openml-python-rest-api coverage run -m pytest -v -m "not php_api"
- run: docker exec openml-python-rest-api coverage xml
- name: Upload results to Codecov
uses: codecov/codecov-action@v4
with:
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
docker/mysql/data
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
71 changes: 51 additions & 20 deletions docker-compose.yaml
@@ -1,13 +1,33 @@
services:
database:
image: "openml/test-database"
profiles: ["python", "php", "all"]
image: "openml/test-database:20240105"
container_name: "openml-test-database"
environment:
MYSQL_ROOT_PASSWORD: ok
ports:
- "3306:3306"
healthcheck:
test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"]
start_period: 30s
start_interval: 1s
timeout: 3s
interval: 5s
retries: 10
Comment on lines +13 to +16
issue (bug_risk): The start_interval healthcheck key is only honored by Docker Engine 25.0+ and recent Compose releases; older engines silently ignore it.

On engines that predate 25.0, healthchecks only support test, interval, timeout, retries, and start_period, so start_interval has no effect there. If older engines must be supported, control check frequency with interval (and optionally start_period) instead, and remove start_interval here and in the Elasticsearch service.


database-setup:
profiles: ["python", "php", "all"]
image: mysql
container_name: "openml-test-database-setup"
volumes:
- ./docker/database/update.sh:/database-update.sh
command: /bin/sh -c "/database-update.sh"
depends_on:
database:
condition: service_healthy

docs:
profiles: ["all"]
build:
context: .
dockerfile: docker/docs/Dockerfile
@@ -16,8 +36,35 @@ services:
volumes:
- .:/docs

elasticsearch:
profiles: ["php", "all"]
image: docker.elastic.co/elasticsearch/elasticsearch:6.8.23
container_name: "openml-elasticsearch"
platform: "linux/amd64"
ports:
- "9200:9200" # also known as /es (nginx)
- "9300:9300"
env_file: docker/elasticsearch/.env
healthcheck:
test: curl 127.0.0.1:9200/_cluster/health | grep -e "green"
start_period: 30s
start_interval: 5s
timeout: 3s
interval: 10s
Comment on lines +51 to +53
issue (bug_risk): Same start_interval caveat applies to the Elasticsearch healthcheck configuration.

As with the database service, start_interval is only honored on Docker Engine 25.0+ and is ignored on older engines; there, rely on start_period together with interval, or just adjust interval.

Comment on lines +48 to +53
⚠️ Potential issue | 🟡 Minor

Elasticsearch healthcheck may be too strict for single-node clusters.

The healthcheck only accepts "green" status, but single-node Elasticsearch clusters typically report "yellow" because replicas cannot be allocated. This may cause the healthcheck to fail indefinitely.

Proposed fix to accept both green and yellow
     healthcheck:
-      test: curl 127.0.0.1:9200/_cluster/health | grep -e "green"
+      test: curl -s 127.0.0.1:9200/_cluster/health | grep -E '"status":"(green|yellow)"'
       start_period: 30s

deploy:
resources:
limits:
cpus: '1'
memory: 1G
reservations:
cpus: '0.2'
memory: 250M

php-api:
image: "openml/php-rest-api"
profiles: ["php", "all"]
image: "openml/php-rest-api:v1.2.2"
container_name: "openml-php-rest-api"
env_file: docker/php/.env
ports:
- "8002:80"
depends_on:
@@ -33,7 +80,8 @@ services:
interval: 1m

python-api:
container_name: "python-api"
profiles: ["python", "all"]
container_name: "openml-python-rest-api"
build:
context: .
dockerfile: docker/python/Dockerfile
@@ -43,20 +91,3 @@ services:
- .:/python-api
depends_on:
- database

elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:6.8.23
container_name: "elasticsearch"
ports:
- "9200:9200"
- "9300:9300"
environment:
- ELASTIC_PASSWORD=default
- discovery.type=single-node
- xpack.security.enabled=false
healthcheck:
test: curl 127.0.0.1:9200/_cluster/health | grep -e "green"
start_period: 30s
start_interval: 5s
timeout: 3s
interval: 1m
31 changes: 31 additions & 0 deletions docker/database/update.sh
@@ -0,0 +1,31 @@
#/bin/bash
⚠️ Potential issue | 🔴 Critical

Fix the shebang syntax.

The shebang is missing the ! character. This will cause the script to not execute with bash semantics and may fail or behave unexpectedly.

Proposed fix
-#/bin/bash
+#!/bin/bash
🧰 Tools
🪛 Shellcheck (0.11.0): [error] SC1113 (line 1) — Use #!, not just #, for the shebang.


# Change the filepath of openml.file
# from "https://www.openml.org/data/download/1666876/phpFsFYVN"
# to "http://minio:9000/datasets/0000/0001/phpFsFYVN"
mysql -hdatabase -uroot -pok -e 'UPDATE openml.file SET filepath = CONCAT("http://minio:9000/datasets/0000/", LPAD(id, 4, "0"), "/", SUBSTRING_INDEX(filepath, "/", -1)) WHERE extension="arff";'

# Update openml.expdb.dataset with the same url
mysql -hdatabase -uroot -pok -e 'UPDATE openml_expdb.dataset DS, openml.file FL SET DS.url = FL.filepath WHERE DS.did = FL.id;'
Comment on lines +5 to +8
issue (bug_risk): The MinIO dataset URL pattern here is inconsistent with the application’s routing logic and will fail for larger dataset IDs.

In src/core/formatting.py, dataset URLs use a computed ten-thousands prefix (dataset.did // 10_000:04d) plus a padded ID. This SQL instead hardcodes datasets/0000/ and only pads id to 4 digits, so id >= 10000 will map to the wrong path. If this script should work for all datasets, please compute both the prefix and padded ID in SQL (e.g., using integer division and LPAD) to match the Python logic.
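
For reference, a minimal Python sketch (not part of this PR) of the prefix logic in src/core/formatting.py that the SQL would need to reproduce; the function name and the SQL fragment in the comments are illustrative only, and the SQL is an untested suggestion:

```python
def minio_dataset_prefix(did: int) -> str:
    """Directory prefix as computed by _format_parquet_url: ten-thousands bucket + padded id."""
    ten_thousands_prefix = f"{did // 10_000:04d}"  # 0-9999 -> "0000", 10000-19999 -> "0001", ...
    padded_id = f"{did:04d}"                       # never truncates; only left-pads short ids
    return f"datasets/{ten_thousands_prefix}/{padded_id}"

# The hardcoded "datasets/0000/" in update.sh only matches the first bucket, and
# MySQL's LPAD(id, 4, "0") additionally truncates ids longer than four digits:
assert minio_dataset_prefix(1) == "datasets/0000/0001"       # agrees with the current SQL
assert minio_dataset_prefix(12345) == "datasets/0001/12345"  # the SQL would emit datasets/0000/1234

# A possible (untested) SQL expression that mirrors the Python logic:
#   CONCAT("http://minio:9000/datasets/", LPAD(FLOOR(id / 10000), 4, "0"), "/",
#          LPAD(id, GREATEST(CHAR_LENGTH(id), 4), "0"), "/", SUBSTRING_INDEX(filepath, "/", -1))
```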






# Create the data_feature_description TABLE. TODO: can we make sure this table exists already?
mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `data_feature_description` (
`did` int unsigned NOT NULL,
`index` int unsigned NOT NULL,
`uploader` mediumint unsigned NOT NULL,
`date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`description_type` enum("plain", "ontology") NOT NULL,
`value` varchar(256) NOT NULL,
KEY `did` (`did`,`index`),
CONSTRAINT `data_feature_description_ibfk_1` FOREIGN KEY (`did`, `index`) REFERENCES `data_feature` (`did`, `index`) ON DELETE CASCADE ON UPDATE CASCADE
)'

# SET dataset 1 to active (used in unittests java)
mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'INSERT IGNORE INTO dataset_status VALUES (1, "active", "2024-01-01 00:00:00", 1)'
mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'DELETE FROM dataset_status WHERE did = 2 AND status = "deactivated";'

# Temporary fix in case the database missed the kaggle table. The PHP Rest API expects the table to be there, while indexing.
mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `kaggle` (`dataset_id` int(11) DEFAULT NULL, `kaggle_link` varchar(500) DEFAULT NULL)'
3 changes: 3 additions & 0 deletions docker/elasticsearch/.env
@@ -0,0 +1,3 @@
ELASTIC_PASSWORD=default
discovery.type=single-node
xpack.security.enabled=false
2 changes: 1 addition & 1 deletion docker/mysql/README.md
@@ -16,7 +16,7 @@ which sets:

You should be able to connect to it using `mysql`:
```bash

mysql --host 127.0.0.1 --port 3306 -uroot -pok
```
If you do not have `mysql` installed, you may refer to the MySQL image documentation on
how to use the image instead to connect over a docker network if you want to connect
14 changes: 14 additions & 0 deletions docker/php/.env
@@ -0,0 +1,14 @@
API_KEY=AD000000000000000000000000000000
BASE_URL=http://php-api:80/
MINIO_URL=http://minio:9000/
DB_HOST_OPENML=database:3306
DB_HOST_EXPDB=database:3306
DB_USER_OPENML=root
DB_PASS_OPENML=ok
DB_USER_EXPDB_WRITE=root
DB_PASS_EXPDB_WRITE=ok
DB_USER_EXPDB_READ=root
DB_PASS_EXPDB_READ=ok
ES_URL=elasticsearch:9200
ES_PASSWORD=default
INDEX_ES_DURING_STARTUP=false
Comment on lines +1 to +14
⚠️ Potential issue | 🔴 Critical

Remove docker/php/.env from version control and create a template file.

This file contains sensitive credentials (API keys, database passwords) and is tracked by git even though .env is listed in .gitignore — ignore rules only apply to untracked files, so they have no effect on a file that has already been committed.

Actions needed:

  1. Make the intent explicit for nested env files by adding docker/**/.env to .gitignore (the root .env pattern already matches subdirectories, but only for untracked files)
  2. Create docker/php/.env.example with placeholder values for each variable
  3. Remove the committed file from the index with git rm --cached docker/php/.env, and purge it from history (git filter-repo or BFG) if the credentials are real
  4. Document the setup process in your README with instructions to copy the example file and populate real values
🧰 Tools
🪛 dotenv-linter (4.0.0) — UnorderedKey warnings:

[warning] 4: DB_HOST_OPENML should go before MINIO_URL
[warning] 5: DB_HOST_EXPDB should go before DB_HOST_OPENML
[warning] 6: DB_USER_OPENML should go before MINIO_URL
[warning] 7: DB_PASS_OPENML should go before DB_USER_OPENML
[warning] 8: DB_USER_EXPDB_WRITE should go before DB_USER_OPENML
[warning] 9: DB_PASS_EXPDB_WRITE should go before DB_PASS_OPENML
[warning] 10: DB_USER_EXPDB_READ should go before DB_USER_EXPDB_WRITE
[warning] 11: DB_PASS_EXPDB_READ should go before DB_PASS_EXPDB_WRITE
[warning] 12: ES_URL should go before MINIO_URL
[warning] 13: ES_PASSWORD should go before ES_URL
[warning] 14: INDEX_ES_DURING_STARTUP should go before MINIO_URL

27 changes: 1 addition & 26 deletions docker/readme.md
@@ -7,39 +7,14 @@ This directory contains the files and information to build the following 5 image
- docs: the official [mkdocs-material](https://hub.docker.com/r/squidfunk/mkdocs-material)
image but with additional plugins installed required for building the documentation
in this project's `/doc` directory.
- [openml/php-rest-api](https://hub.docker.com/r/openml/php-rest-api): image with the
php back-end code, but ran on [feature/elasticsearch8](https://github.com/openml/openml/tree/feature/elasticsearch8)
branch.
- python-api: an image of this project, to facilitate development on any platform.
- [openml/elasticsearch8-prebuilt](https://hub.docker.com/r/openml/elasticsearch8-prebuilt):
the default elasticsearch image, but with indices already built on the test database
through invocation of the old php code.

Between the prebuilt indices and the baked-in database, when all images have already been
pulled, a `docker compose up` step should only take seconds. 🚀

## Building `openml/elasticsearch8-prebuilt`
The `openml/elasticsearch8-prebuilt` is not made with a Dockerfile, because it requires
steps of running containers, which to the best of my knowledge is not facilitated by
docker (not even through [multi-stage builds](https://docs.docker.com/build/building/multi-stage/)).
So, instead we build the container state locally and then use [`docker commit`](https://docs.docker.com/engine/reference/commandline/commit/).

1. run `docker compose up`, but with the `elasticsearch` service pointing to
`docker.elastic.co/elasticsearch/elasticsearch:8.10.4` instead of `openml/elasticsearch8-prebuilt`.
2. build the indices from the `php-api` container:

1. Connect to the container: `docker exec -it server-api-php-api-1 /bin/bash`
2. (optional) Edit `/var/www/openml/index.php` and set L56 to `development` instead of `production`,
this will show progress of building the indices, or print out any error that may occur.
3. Build the indices: `php /var/www/openml/index.php cron build_es_indices`
4. Exit the container with `exit`.

3. Make a commit of the elastic search container with prebuilt indices: `docker commit elasticsearch openml/elasticsearch8-prebuilt`
4. Push the image created by the commit: `docker push openml/elasticsearch8-prebuilt`

## Building for multiple platforms

Following Docker's "[multi-platform images](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwiTutyczsOCAxUUhv0HHe_VA6QQFnoECBAQAQ&url=https%3A%2F%2Fdocs.docker.com%2Fbuild%2Fbuilding%2Fmulti-platform%2F&usg=AOvVaw0YP_mkj5WTYD-0weEfrfDv&opi=89978449)"
Following Docker's "[multi-platform images](https://docs.docker.com/build/building/multi-platform/)"
documentation, we can build multi-platform images in a few simple steps:

1. Only the first time, create a docker-container driver: `docker buildx create --name container --driver=docker-container`
14 changes: 12 additions & 2 deletions src/config.py
@@ -8,6 +8,8 @@

TomlTable = dict[str, typing.Any]

CONFIG_PATH = Path(__file__).parent / "config.toml"


def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable:
defaults = configuration["defaults"]
@@ -19,9 +19,17 @@ def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable:


@functools.cache
def load_database_configuration(file: Path = Path(__file__).parent / "config.toml") -> TomlTable:
configuration = tomllib.loads(file.read_text())
def _load_configuration(file: Path) -> TomlTable:
return typing.cast(TomlTable, tomllib.loads(file.read_text()))


def load_routing_configuration(file: Path = CONFIG_PATH) -> TomlTable:
return typing.cast(TomlTable, _load_configuration(file)["routing"])


@functools.cache
def load_database_configuration(file: Path = CONFIG_PATH) -> TomlTable:
configuration = _load_configuration(file)
database_configuration = _apply_defaults_to_siblings(
configuration["databases"],
)
4 changes: 4 additions & 0 deletions src/config.toml
@@ -15,3 +15,7 @@ database="openml_expdb"

[databases.openml]
database="openml"

[routing]
minio_url="http://minio:9000/"
server_url="http://php-api:80/"
Comment on lines +19 to +21
⚠️ Potential issue | 🔴 Critical


Routing configuration lacks environment variable override support.

The routing URLs use Docker service names (minio:9000, php-api:80), which are appropriate for containerized environments. However, unlike the database configuration which supports environment variable overrides (e.g., OPENML_DATABASES_OPENML_USERNAME), the load_routing_configuration() function does not implement similar environment variable override support for the routing URLs. For non-containerized deployments (local development, production), consider adding environment variable override support for minio_url and server_url to the configuration loading logic.
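
One possible shape for such an override, reusing the helpers this PR already defines in src/config.py (CONFIG_PATH, _load_configuration, TomlTable); the OPENML_ROUTING_* variable names are purely illustrative, not an existing convention in this repository, and no URL validation is attempted:

```python
import os
import typing
from pathlib import Path

# Sketch only: assumes it lives in src/config.py next to CONFIG_PATH,
# _load_configuration and TomlTable as introduced in this diff.
def load_routing_configuration(file: Path = CONFIG_PATH) -> TomlTable:
    routing = typing.cast(TomlTable, _load_configuration(file)["routing"])
    for key in ("minio_url", "server_url"):
        override = os.environ.get(f"OPENML_ROUTING_{key.upper()}")
        if override:  # fall back to config.toml when unset or empty
            routing[key] = override
    return routing
```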


13 changes: 7 additions & 6 deletions src/core/formatting.py
@@ -2,7 +2,7 @@

from sqlalchemy.engine import Row

from config import load_configuration
from config import load_routing_configuration
from core.errors import DatasetError
from schemas.datasets.openml import DatasetFileFormat

@@ -25,15 +25,16 @@ def _format_parquet_url(dataset: Row) -> str | None:
if dataset.format.lower() != DatasetFileFormat.ARFF:
return None

minio_base_url = load_configuration()["minio_base_url"]
prefix = dataset.did // 10_000
return f"{minio_base_url}/datasets/{prefix:04d}/{dataset.did:04d}/dataset_{dataset.did}.pq"
minio_base_url = load_routing_configuration()["minio_url"]
ten_thousands_prefix = f"{dataset.did // 10_000:04d}"
padded_id = f"{dataset.did:04d}"
return f"{minio_base_url}datasets/{ten_thousands_prefix}/{padded_id}/dataset_{dataset.did}.pq"


def _format_dataset_url(dataset: Row) -> str:
base_url = load_configuration()["arff_base_url"]
base_url = load_routing_configuration()["server_url"]
filename = f"{html.escape(dataset.name)}.{dataset.format.lower()}"
return f"{base_url}/data/v1/download/{dataset.file_id}/{filename}"
return f"{base_url}data/v1/download/{dataset.file_id}/{filename}"


def _safe_unquote(text: str | None) -> str | None:
6 changes: 3 additions & 3 deletions src/main.py
@@ -21,18 +21,18 @@ def _parse_args() -> argparse.Namespace:
"uvicorn",
"arguments forwarded to uvicorn",
)
uvicorn_options.add_argument(
_ = uvicorn_options.add_argument(
"--reload",
action="store_true",
help="Enable auto-reload",
)
uvicorn_options.add_argument(
_ = uvicorn_options.add_argument(
"--host",
default="127.0.0.1",
type=str,
help="Bind socket to this host.",
)
uvicorn_options.add_argument(
_ = uvicorn_options.add_argument(
"--port",
default=8000,
type=int,
10 changes: 2 additions & 8 deletions src/routers/openml/flows.py
@@ -49,14 +49,8 @@ def get_flow(flow_id: int, expdb: Annotated[Connection, Depends(expdb_connection
]

tags = database.flows.get_tags(flow_id, expdb)
flow_rows = database.flows.get_subflows(for_flow=flow_id, expdb=expdb)
subflows = [
{
"identifier": flow.identifier,
"flow": get_flow(flow_id=flow.child_id, expdb=expdb),
}
for flow in flow_rows
]
flow_rows = database.flows.get_subflows(flow_id, expdb)
subflows = [get_flow(flow_id=flow.child_id, expdb=expdb) for flow in flow_rows]
Comment on lines +52 to +53
⚠️ Potential issue | 🟠 Major

Potential infinite recursion with circular subflow references.

The recursive call to get_flow for each subflow has no cycle detection. If the database contains a circular reference (flow A → flow B → flow A), this will cause infinite recursion and a stack overflow.

Proposed fix with cycle detection
 @router.get("/{flow_id}")
-def get_flow(flow_id: int, expdb: Annotated[Connection, Depends(expdb_connection)] = None) -> Flow:
+def get_flow(
+    flow_id: int,
+    expdb: Annotated[Connection, Depends(expdb_connection)] = None,
+    _visited: set[int] | None = None,
+) -> Flow:
+    if _visited is None:
+        _visited = set()
+    if flow_id in _visited:
+        raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail="Circular subflow reference detected")
+    _visited.add(flow_id)
+
     flow = database.flows.get(flow_id, expdb)
     ...
     flow_rows = database.flows.get_subflows(flow_id, expdb)
-    subflows = [get_flow(flow_id=flow.child_id, expdb=expdb) for flow in flow_rows]
+    subflows = [get_flow(flow_id=flow.child_id, expdb=expdb, _visited=_visited.copy()) for flow in flow_rows]

Committable suggestion skipped: line range outside the PR's diff.
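
A self-contained sketch of the visited-set approach, decoupled from the FastAPI route for clarity; collect_flow_tree and fetch_subflow_ids are illustrative stand-ins for get_flow and database.flows.get_subflows, not code from this PR:

```python
from typing import Callable

def collect_flow_tree(
    flow_id: int,
    fetch_subflow_ids: Callable[[int], list[int]],
    _visited: frozenset[int] = frozenset(),
) -> dict:
    """Build a nested flow structure, refusing to follow circular references."""
    if flow_id in _visited:
        raise ValueError(f"circular subflow reference detected at flow {flow_id}")
    seen = _visited | {flow_id}  # new frozenset keeps sibling branches independent
    return {
        "id": flow_id,
        "subflows": [
            collect_flow_tree(child_id, fetch_subflow_ids, seen)
            for child_id in fetch_subflow_ids(flow_id)
        ],
    }

# Example: flow 1 -> flow 2 -> flow 1 raises instead of recursing until stack exhaustion.
edges = {1: [2], 2: [1]}
try:
    collect_flow_tree(1, lambda fid: edges.get(fid, []))
except ValueError as err:
    print(err)  # circular subflow reference detected at flow 1
```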



return Flow(
id_=flow.id,
4 changes: 3 additions & 1 deletion src/routers/openml/tasks.py
@@ -7,6 +7,7 @@
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import Connection, RowMapping, text

import config
import database.datasets
import database.tasks
from routers.dependencies import expdb_connection
@@ -139,7 +140,8 @@ def _fill_json_template(
# I believe that the operations below are always part of string output, so
# we don't need to be careful to avoid losing typedness
template = template.replace("[TASK:id]", str(task.task_id))
return template.replace("[CONSTANT:base_url]", "https://test.openml.org/")
server_url = config.load_routing_configuration()["server_url"]
return template.replace("[CONSTANT:base_url]", server_url)


@router.get("/{task_id}")
2 changes: 1 addition & 1 deletion src/schemas/datasets/mldcat_ap.py
@@ -275,7 +275,7 @@ class DataService(JsonLDObject):


class JsonLDGraph(BaseModel):
context: str | dict[str, HttpUrl] = Field(default_factory=dict, serialization_alias="@context")
context: str | dict[str, HttpUrl] = Field(default_factory=dict, serialization_alias="@context") # type: ignore[arg-type]
graph: list[Distribution | DataService | Dataset | Quality | Feature | Agent | MD5Checksum] = (
Field(default_factory=list, serialization_alias="@graph")
)