diff --git a/.gitignore b/.gitignore index b6e4761..0a10c95 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,4 @@ dmypy.json # Pyre type checker .pyre/ +pyrightconfig.json diff --git a/Generate ecoinvent transitive mapping.ipynb b/Generate ecoinvent transitive mapping.ipynb new file mode 100644 index 0000000..bb14887 --- /dev/null +++ b/Generate ecoinvent transitive mapping.ipynb @@ -0,0 +1,618 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d8e9c0b3-dfa9-46cc-b973-5fe953f38521", + "metadata": {}, + "outputs": [], + "source": [ + "import randonneur_data as rd\n", + "import randonneur as rn\n", + "from flowmapper.extraction.ecospold2 import remove_conflicting_synonyms, reformat\n", + "from pathlib import Path\n", + "import xmltodict\n", + "import structlog\n", + "import logging\n", + "import logging.config\n", + "from tqdm import tqdm\n", + "from copy import deepcopy\n", + "from collections import defaultdict" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "303ee863-eabf-42ff-bef7-cb09e654bc7e", + "metadata": {}, + "outputs": [], + "source": [ + "logging.config.dictConfig({\n", + " \"version\": 1,\n", + " \"disable_existing_loggers\": False,\n", + " \"handlers\": {\n", + " \"file\": {\n", + " \"level\": \"DEBUG\",\n", + " \"class\": \"logging.handlers.WatchedFileHandler\",\n", + " \"filename\": \"test.log\",\n", + " },\n", + " },\n", + " \"loggers\": {\n", + " \"\": {\n", + " \"handlers\": [\"file\"],\n", + " \"level\": \"DEBUG\",\n", + " \"propagate\": True,\n", + " },\n", + " }\n", + "})\n", + "structlog.configure(\n", + " processors=[\n", + " structlog.stdlib.filter_by_level,\n", + " structlog.stdlib.add_logger_name,\n", + " structlog.stdlib.add_log_level,\n", + " structlog.stdlib.PositionalArgumentsFormatter(),\n", + " structlog.processors.TimeStamper(fmt=\"iso\"),\n", + " structlog.processors.StackInfoRenderer(),\n", + " structlog.processors.format_exc_info,\n", + " structlog.processors.UnicodeDecoder(),\n", + " structlog.processors.JSONRenderer(),\n", + " structlog.stdlib.ProcessorFormatter.wrap_for_formatter,\n", + " ],\n", + " logger_factory=structlog.stdlib.LoggerFactory(),\n", + " wrapper_class=structlog.stdlib.BoundLogger,\n", + " cache_logger_on_first_use=True,\n", + ")\n", + "logger = structlog.get_logger(\"ecoinvent-migrate\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e80afdee-7e95-4cb4-8c9b-eb85aed4312e", + "metadata": {}, + "outputs": [], + "source": [ + "registry = rd.Registry()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6890a48b-9a26-4365-8e3d-1d9bcfaa69f5", + "metadata": {}, + "outputs": [], + "source": [ + "def get_elem_flow_data(version: str) -> list[dict]:\n", + " path = Path(f\"/Users/cmutel/Library/Application Support/EcoinventInterface/cache/ecoinvent {version}_cutoff_ecoSpold02/MasterData/ElementaryExchanges.xml\")\n", + "\n", + " if not path.is_file():\n", + " path = Path(f\"/Users/cmutel/Library/Application Support/EcoinventInterface/cache/ecoinvent {version}_cut-off_ecoSpold02/MasterData/ElementaryExchanges.xml\")\n", + "\n", + " with open(path) as fs:\n", + " ei_xml = xmltodict.parse(fs.read(), strip_whitespace=False)[\n", + " \"validElementaryExchanges\"\n", + " ][\"elementaryExchange\"]\n", + "\n", + " data = remove_conflicting_synonyms([reformat(obj) for obj in ei_xml])\n", + "\n", + " for obj in data:\n", + " if \"formula\" in obj:\n", + " del obj[\"formula\"]\n", + " if \"cas_number\" in obj and not obj[\"cas_number\"]:\n", + " del obj[\"cas_number\"]\n", + "\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c6617ef6-54d4-4c9d-bb98-c4a7dfed81a3", + "metadata": {}, + "outputs": [], + "source": [ + "def get_elem_flow_dict(version: str) -> dict:\n", + " return {row['identifier']: row for row in get_elem_flow_data(version)}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "35fd877d-ebe2-45be-9381-d49fdcfa067a", + "metadata": {}, + "outputs": [], + "source": [ + "def add_comment(comment: str | None, addition: str, deletions: list[str] = [\"replaced\"]) -> str:\n", + " if comment is None:\n", + " comment = \"\"\n", + "\n", + " for deletion in deletions:\n", + " if comment == deletion:\n", + " comment = \"\"\n", + "\n", + " if comment and not comment.endswith(\".\"):\n", + " comment += \".\"\n", + "\n", + " if comment:\n", + " return comment + \" \" + addition\n", + " else:\n", + " return addition" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "49971a58-c14f-4480-86c5-698416308380", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_changes(\n", + " source_version: str,\n", + " target_version: str\n", + ") -> list[dict]:\n", + " \"\"\"Build a change list by comparing elementary flow master data of two ecoinvent versions.\"\"\"\n", + " source_flow_dict = get_elem_flow_dict(source_version)\n", + " target_flow_dict = get_elem_flow_dict(target_version)\n", + "\n", + " changes = []\n", + "\n", + " for s_key, s_data in source_flow_dict.items():\n", + " if s_key not in target_flow_dict:\n", + " logger.debug(\"Elementary flow deleted: %s\", s_data)\n", + " continue\n", + "\n", + " t_data = target_flow_dict[s_key]\n", + "\n", + " if t_data == s_data:\n", + " continue\n", + "\n", + " attributes = \", \".join([\n", + " key\n", + " for key, value in t_data.items()\n", + " if key in s_data\n", + " and s_data[key] != value\n", + " ])\n", + " change = {\n", + " \"source\": s_data,\n", + " \"target\": t_data,\n", + " \"comment\": f\"Changed {attributes} from {source_version} to {target_version}.\",\n", + " \"source_version\": f\"ecoinvent-{source_version}-biosphere\",\n", + " \"target_version\": f\"ecoinvent-{target_version}-biosphere\"\n", + " }\n", + " changes.append(change)\n", + "\n", + " return changes" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fe2a20c7-88ea-4019-a657-67110684532c", + "metadata": {}, + "outputs": [], + "source": [ + "def supplement_changes(\n", + " changes: list[dict],\n", + " source_version: str,\n", + " target_version: str\n", + ") -> list[dict]:\n", + " \"\"\"Use the `uuid` in the change list to add in other attributes not given in change list.\"\"\"\n", + " source_flow_dict = get_elem_flow_dict(source_version)\n", + " target_flow_dict = get_elem_flow_dict(target_version)\n", + "\n", + " for change in changes:\n", + " if \"formula\" in change[\"source\"]:\n", + " del change[\"source\"][\"formula\"]\n", + " if \"formula\" in change[\"target\"]:\n", + " del change[\"target\"][\"formula\"]\n", + "\n", + " change['source'].update(source_flow_dict[change['source']['uuid']])\n", + " del change['source']['uuid']\n", + " change['target'].update(target_flow_dict[change['target']['uuid']])\n", + " del change['target']['uuid']\n", + "\n", + " attributes = \", \".join([\n", + " key\n", + " for key, value in change['target'].items()\n", + " if key in change['source']\n", + " and change['source'][key] != value\n", + " ])\n", + " comment = add_comment(\n", + " change.get(\"comment\"),\n",
+ " f\"Changed {attributes} from {source_version} to {target_version}.\"\n", + " )\n", + " change['comment'] = comment\n", + " change[\"source_version\"] = f\"ecoinvent-{source_version}-biosphere\"\n", + " change[\"target_version\"] = f\"ecoinvent-{target_version}-biosphere\"\n", + "\n", + " return changes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a7ba6cba-eedb-4502-befc-c4282e9b71a0", + "metadata": {}, + "outputs": [], + "source": [ + "def get_filtered_rd_changes(from_v: str, to_v: str) -> list[dict]:\n", + " \"\"\"Return a filtered list of biosphere changes where the name or uuid changed\"\"\"\n", + " raw = registry.get_file(f'ecoinvent-{from_v}-biosphere-ecoinvent-{to_v}-biosphere')\n", + " if 'replace' in raw:\n", + " data = raw['replace']\n", + " elif 'update' in raw:\n", + " data = raw['update']\n", + " else:\n", + " print(\"No update changes found\")\n", + " return []\n", + " data = [\n", + " obj\n", + " for obj in data\n", + " if 'name' in obj['target']\n", + " or obj['target']['uuid'] != obj['source']['uuid']\n", + " ]\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "29185ef0-1fea-4b21-a4bd-3c8e18d2bc42", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_only_synonyms_change(data: list[dict]) -> list[dict]:\n", + " good = []\n", + "\n", + " for line in data:\n", + " source = {k: v for k, v in line['source'].items() if k != \"synonyms\"}\n", + " target = {k: v for k, v in line['target'].items() if k != \"synonyms\"}\n", + " if source != target:\n", + " good.append(line)\n", + "\n", + " return good" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b900b65d-6af7-4e86-8efd-91675942e248", + "metadata": {}, + "outputs": [], + "source": [ + "def apply_forward_change(data: list[dict], other: list[dict]) -> list[dict]:\n", + " \"\"\"Apply additional changes to get transitive change set.\"\"\"\n", + " other_mapping = {obj['source']['identifier']: obj for obj in other}\n", + " \n", + " for obj in data:\n", + " try:\n", + " transitive = other_mapping[obj['target']['identifier']]\n", + " obj['target'] = transitive['target']\n", + " if transitive.get(\"comment\"):\n", + " obj['comment'] = add_comment(obj.get(\"comment\"), addition=transitive[\"comment\"])\n", + " obj[\"target_version\"] = transitive[\"target_version\"]\n", + "\n", + " if \"conversion_factor\" in transitive:\n", + " obj[\"conversion_factor\"] = obj.get(\"conversion_factor\", 1.) 
* transitive[\"conversion_factor\"]\n", + " \n", + " logger.debug(\"Mapping change: %s\", obj)\n", + " except KeyError:\n", + " continue\n", + "\n", + " input_uuids = {obj['source'].get('identifier', None) for obj in data}\n", + " extra = [obj for obj in other if obj['source']['identifier'] not in input_uuids]\n", + " \n", + " return data + remove_only_synonyms_change(extra)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "34d4eb99-3dcf-4f0f-9121-c93b68ce0c85", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_transitive_datapackage(data: list[dict], source_id: str, end_target: str) -> rn.Datapackage:\n", + " dp = rn.Datapackage(\n", + " name=f\"ecoinvent-{source_id}-biosphere-ecoinvent-{end_target}-biosphere-transitive\",\n", + " source_id=f\"ecoinvent-{source_id}-biosphere\",\n", + " target_id=f\"ecoinvent-{end_target}-biosphere\",\n", + " description=f\"Transitive ecoinvent elementary flow correspondence from {source_id} to {end_target}\",\n", + " contributors=[{\"title\": \"Chris Mutel\", \"roles\": [\"author\"], \"path\": \"https://chris.mutel.org\"}],\n", + " mapping_source=rn.MappingConstants.ECOSPOLD2_BIO_FLOWMAPPER,\n", + " mapping_target=rn.MappingConstants.ECOSPOLD2_BIO_FLOWMAPPER,\n", + " version=\"1.0\",\n", + " )\n", + " dp.add_data(verb=\"update\", data=data)\n", + " filename = f\"ecoinvent-{source_id}-biosphere-ecoinvent-{end_target}-biosphere-transitive.json\"\n", + " dp.to_json(filename)\n", + " registry.add_file(filename, replace=True)\n", + " return dp" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9b5d3432-dc49-4089-a416-94a40f070de7", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_datapackage(data: list[dict], source_id: str, target_id: str) -> rn.Datapackage:\n", + " dp = rn.Datapackage(\n", + " name=f\"ecoinvent-{source_id}-biosphere-ecoinvent-{target_id}-biosphere\",\n", + " source_id=f\"ecoinvent-{source_id}-biosphere\",\n", + " target_id=f\"ecoinvent-{target_id}-biosphere\",\n", + " description=f\"ecoinvent elementary flow correspondence from {source_id} to {target_id}\",\n", + " contributors=[{\"title\": \"Chris Mutel\", \"roles\": [\"author\"], \"path\": \"https://chris.mutel.org\"}],\n", + " mapping_source=rn.MappingConstants.ECOSPOLD2_BIO_FLOWMAPPER,\n", + " mapping_target=rn.MappingConstants.ECOSPOLD2_BIO_FLOWMAPPER,\n", + " version=\"1.0\",\n", + " )\n", + " dp.add_data(verb=\"update\", data=data)\n", + " filename = f\"ecoinvent-{source_id}-biosphere-ecoinvent-{target_id}-biosphere.json\"\n", + " dp.to_json(filename)\n", + " registry.add_file(filename, replace=True)\n", + " return dp" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "46265815-4d3b-442a-ad19-37c98d94d04d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Flowmapper-standard-units-harmonization',\n", + " 'SimaPro-2025-ecoinvent-3.12-context',\n", + " 'SimaPro-9-ecoinvent-3.8-biosphere',\n", + " 'SimaPro-9-ecoinvent-3.8-biosphere-manual-matches',\n", + " 'SimaPro-9-ecoinvent-3.9-biosphere',\n", + " 'SimaPro-9-ecoinvent-3.9-biosphere-manual-matches',\n", + " 'agribalyse-3.1.1-biosphere-ecoinvent-3.8-biosphere',\n", + " 'agribalyse-3.1.1-delete-aggregated-ecoinvent',\n", + " 'agribalyse-3.1.1-ecoinvent-3.10-biosphere-manual-matches',\n", + " 'agribalyse-3.1.1-restore-simapro-ecoinvent-names',\n", + " 'agrifootprint-2022-delete-aggregated-ecoinvent',\n", + " 'agrifootprint-2022-ecoinvent-3.10-biosphere',\n", + " 'agrifootprint-2022-ecoinvent-3.8-biosphere',\n", + " 
'agrifootprint-2022-restore-simapro-ecoinvent-names',\n", + " 'ecoinvent-2.2-biosphere-context-ecoinvent-3.0-biosphere-context',\n", + " 'ecoinvent-2.2-biosphere-ecoinvent-3.0-biosphere',\n", + " 'ecoinvent-2.2-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.01-biosphere-ecoinvent-3.1-biosphere',\n", + " 'ecoinvent-3.01-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.1-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.10.1-biosphere-EF-3.1-biosphere',\n", + " 'ecoinvent-3.10.1-biosphere-ecoinvent-3.11-biosphere',\n", + " 'ecoinvent-3.10.1-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.10.1-cutoff-ecoinvent-3.11-cutoff',\n", + " 'ecoinvent-3.11-biosphere-EF-3.1-biosphere',\n", + " 'ecoinvent-3.11-cutoff-ecoinvent-3.12-cutoff',\n", + " 'ecoinvent-3.2-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.3-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.3-biosphere-ecoinvent-3.4-biosphere',\n", + " 'ecoinvent-3.4-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.4-biosphere-ecoinvent-3.5-biosphere',\n", + " 'ecoinvent-3.5-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.5-biosphere-ecoinvent-3.6-biosphere',\n", + " 'ecoinvent-3.6-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.6-biosphere-ecoinvent-3.7-biosphere',\n", + " 'ecoinvent-3.7-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.7-biosphere-ecoinvent-3.8-biosphere',\n", + " 'ecoinvent-3.7.1-cutoff-ecoinvent-3.8-cutoff',\n", + " 'ecoinvent-3.8-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.8-biosphere-ecoinvent-3.9-biosphere',\n", + " 'ecoinvent-3.8-cutoff-ecoinvent-3.9-cutoff',\n", + " 'ecoinvent-3.9.1-biosphere-EF-3.1-biosphere',\n", + " 'ecoinvent-3.9.1-biosphere-ecoinvent-3.10-biosphere',\n", + " 'ecoinvent-3.9.1-biosphere-ecoinvent-3.12-biosphere-transitive',\n", + " 'ecoinvent-3.9.1-cutoff-ecoinvent-3.10-cutoff',\n", + " 'generic-brightway-unit-conversions',\n", + " 'generic-brightway-units-normalization',\n", + " 'simapro-9-ecoinvent-3-context',\n", + " 'simapro-9-ecoinvent-3-water-slash-m3',\n", + " 'simapro-ecoinvent-3.10-cutoff',\n", + " 'simapro-ecoinvent-3.5-apos',\n", + " 'simapro-ecoinvent-3.5-consequential',\n", + " 'simapro-ecoinvent-3.5-cutoff',\n", + " 'simapro-ecoinvent-3.8-cutoff',\n", + " 'simapro-ecoinvent-3.9.1-cutoff']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(list(registry))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4656eba5-4957-478d-a4c8-b853e0da8677", + "metadata": {}, + "outputs": [], + "source": [ + "previous = None\n", + "config = [\n", + " {\n", + " \"rd_source\": \"3.10.1\",\n", + " \"ei_source\": \"3.10.1\",\n", + " \"rd_target\": \"3.11\",\n", + " \"ei_target\": \"3.11\",\n", + " \"supplement\": True,\n", + " },\n", + " {\n", + " \"rd_source\": \"3.9.1\",\n", + " \"ei_source\": \"3.9.1\",\n", + " \"rd_target\": \"3.10\",\n", + " \"ei_target\": \"3.10.1\",\n", + " \"supplement\": True,\n", + " },\n", + " {\n", + " \"rd_source\": \"3.8\",\n", + " \"ei_source\": \"3.8\",\n", + " \"rd_target\": \"3.9\",\n", + " \"ei_target\": \"3.9.1\",\n", + " \"supplement\": True,\n", + " },\n", + " {\n", + " \"ei_source\": \"3.7\",\n", + " \"ei_target\": \"3.8\",\n", + " },\n", + " {\n", + " \"ei_source\": \"3.6\",\n", + " \"ei_target\": \"3.7\",\n", + " },\n", + " {\n", + " \"ei_source\": 
\"3.5\",\n", + " \"ei_target\": \"3.6\",\n", + " },\n", + " {\n", + " \"ei_source\": \"3.4\",\n", + " \"ei_target\": \"3.5\",\n", + " },\n", + " {\n", + " \"ei_source\": \"3.3\",\n", + " \"ei_target\": \"3.4\",\n", + " },\n", + " {\n", + " \"ei_source\": \"3.2\",\n", + " \"ei_target\": \"3.3\",\n", + " },\n", + " {\n", + " \"ei_source\": \"3.1\",\n", + " \"ei_target\": \"3.2\",\n", + " },\n", + " {\n", + " \"ei_source\": \"3.01\",\n", + " \"ei_target\": \"3.1\",\n", + " },\n", + "]\n", + "end_target = \"3.12\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "af525429-ba7a-42db-8509-ee279cfd70c8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:06<00:00, 1.78it/s]\n" + ] + } + ], + "source": [ + "for line in tqdm(config):\n", + " if line.get(\"supplement\"):\n", + " data = supplement_changes(\n", + " get_filtered_rd_changes(line[\"rd_source\"], line[\"rd_target\"]), \n", + " line[\"ei_source\"], \n", + " line[\"ei_target\"],\n", + " )\n", + " else:\n", + " data = generate_changes(line[\"ei_source\"], line[\"ei_target\"])\n", + "\n", + " if not line.get(\"supplement\") and data:\n", + " generate_datapackage(deepcopy(data), line[\"ei_source\"], line[\"ei_target\"])\n", + " \n", + " if previous is not None:\n", + " data = apply_forward_change(data, previous)\n", + "\n", + " generate_transitive_datapackage(deepcopy(data), line[\"ei_source\"], end_target)\n", + " previous = data" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c1d13a46-ddc8-4420-8cc9-0dd42dbb5742", + "metadata": {}, + "outputs": [], + "source": [ + "data_22 = registry.get_file('ecoinvent-2.2-biosphere-ecoinvent-3.0-biosphere')['replace']\n", + "\n", + "data_301 = defaultdict(list)\n", + "\n", + "for obj in get_elem_flow_data(\"3.01\"):\n", + " data_301[obj['name']].append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8d94bc46-dc25-41d3-aef2-caf3b7ad1fb9", + "metadata": {}, + "outputs": [], + "source": [ + "changes = []\n", + "\n", + "for line in data_22:\n", + " s_name, t_name = line['source']['name'], line['target']['name']\n", + " if s_name == t_name:\n", + " continue\n", + " for obj in data_301.get(t_name, []):\n", + " source = {\n", + " \"name\": s_name,\n", + " \"context\": obj[\"context\"],\n", + " }\n", + " changes.append({\n", + " \"source\": source,\n", + " \"target\": obj,\n", + " \"comment\": \"Name change from ecoinvent 2.2 to 3.01\",\n", + " \"source_version\": \"ecoinvent-2.2-biosphere\",\n", + " \"target_version\": \"ecoinvent-3.01-biosphere\",\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e7adaffb-84ba-4359-a1c9-3af052772445", + "metadata": {}, + "outputs": [], + "source": [ + "data = apply_forward_change(changes, previous)\n", + "\n", + "dp = generate_transitive_datapackage(data, \"2.2\", end_target)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d325e762-f400-45fb-b432-754321e44b9d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 +} diff --git a/README.md b/README.md index eb724b4..d6c5bab 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ ontology that `flowmapper` uses: * context: tuple[str], a hierarchical organization into environmental compartments, e.g. `("air", "urban air close to ground")` * unit: str, or complex type with a string representation, e.g. "kg" * sector-specific labels: str, or complex type with a string representation, a set of additional fields which can help identify or further specify a flow, e.g. CAS number 000110-63-4 -* synonyms: list[str], a list of alternative unique names for a substance, e.g. `["Butylene glycol", "butane-1,4-diol"]` +* synonyms: list[str], a list of alternative unique names for a substance, e.g. `["Butylene glycol", "butane-1,4-diol"]`. Synonyms should not overlap. Flowmapper **assumes that the source and target lists are given in this format**; it comes with or plays well with conversion software for data formats like ecospold, FEDEFL, and SimaPro CSV. diff --git a/ecoinvent-3.10-biosphere-simapro-2024-biosphere.json b/ecoinvent-3.10-biosphere-simapro-2024-biosphere.json new file mode 100644 index 0000000..ad24edc --- /dev/null +++ b/ecoinvent-3.10-biosphere-simapro-2024-biosphere.json @@ -0,0 +1,1134 @@ +{ + "name": "ecoinvent-3.10-biosphere-simapro-2024-biosphere", + "description": "Manual SimaPro to ecoinvent name matches without conversion factors", + "contributors": [ + { + "title": "Chris Mutel", + "roles": [ + "author" + ], + "path": "https://chris.mutel.org" + } + ], + "created": "2025-11-16T22:04:33.433871+00:00", + "version": "1.0", + "licenses": [ + { + "name": "CC-BY-4.0", + "path": "https://creativecommons.org/licenses/by/4.0/legalcode", + "title": "Creative Commons Attribution 4.0 International" + } + ], + "graph_context": [ + "edges" + ], + "mapping": { + "source": { + "expression language": "XPath", + "labels": { + "name": "//*:elementaryExchange/*:name/text()", + "cas_number": "//*:elementaryExchange/@casNumber", + "unit": "//*:elementaryExchange/*:unitName/text()", + "identifier": "//*:elementaryExchange/@elementaryExchangeId", + "context": [ + "//*:elementaryExchange/*:compartment/*:compartment/text()", + "//*:elementaryExchange/*:compartment/*:subcompartment/text()" + ], + "synonyms": "//*:elementaryExchange/*:synonym/text()" + } + }, + "target": { + "expression language": "like JSONPath", + "labels": { + "identifier": "Process[*].\"Process identifier\".text", + "name": "Process[*].Products[*].text[0]", + "platform_id": "Process[*].\"Platform Identifier\"", + "unit": [ + "[\"Emissions to air/\", Process[*].\"Emissions to air\".[2]]", + "[\"Emissions to soil/\", Process[*].\"Emissions to soil\".[2]]", + "[\"Emissions to water/\", Process[*].\"Emissions to water\".[2]]", + "[\"Resources/\", Process[*].\"Resources\".[2]]" + ], + "context": [ + "[\"Emissions to air/\", Process[*].\"Emissions to air\".[1]]", + "[\"Emissions to soil/\", Process[*].\"Emissions to soil\".[1]]", + "[\"Emissions to water/\", Process[*].\"Emissions to water\".[1]]", + "[\"Resources/\", Process[*].\"Resources\".[1]]" + ] + } + } + }, + "source_id": "simapro-2024-biosphere", + "target_id": "ecoinvent-3.10-biosphere", + "update": [ + { + "source": { + "name": "Parathion, methyl" + }, + "target": { + "name": "Methyl parathion" + } + }, + { + "source": { + "name": "Thiocyanic acid (-1 ion)" + }, + "target": { + "name": "Thiocyanate" + } + }, + { + "source": { + "name": "Quizalofop ethyl ester" + }, + "target": { + "name": "Quizalofop-ethyl" + } + }, + { 
+ "source": { + "name": "Prothioconazol" + }, + "target": { + "name": "Prothioconazole" + } + }, + { + "source": { + "name": "Pyraclostrobin (prop)" + }, + "target": { + "name": "Pyraclostrobin" + } + }, + { + "source": { + "name": "Monosodium acid methanearsonate" + }, + "target": { + "name": "MSMA" + } + }, + { + "source": { + "name": "Carbamic acid, [(dibutylamino)thio]methyl-, 2,3-dihydro-2,2-dimethyl-7-benzofuranyl ester" + }, + "target": { + "name": "Carbosulfan" + } + }, + { + "source": { + "name": "Benzene, 1-methyl-2-nitro-" + }, + "target": { + "name": "o-Nitrotoluene" + } + }, + { + "source": { + "name": "Alkane (unspecified)" + }, + "target": { + "name": "Hydrocarbons, aliphatic, alkanes, unspecified" + } + }, + { + "source": { + "name": "AOX (Adsorbable Organic Halogens)" + }, + "target": { + "name": "AOX, Adsorbable Organic Halides" + } + }, + { + "source": { + "name": "AOX, Adsorbable Organic Halogen as Cl" + }, + "target": { + "name": "AOX, Adsorbable Organic Halides" + } + }, + { + "source": { + "name": "BOD5 (Biological Oxygen Demand)" + }, + "target": { + "name": "BOD5, Biological Oxygen Demand" + } + }, + { + "source": { + "name": "COD (Chemical Oxygen Demand)" + }, + "target": { + "name": "COD, Chemical Oxygen Demand" + } + }, + { + "source": { + "name": "Wood, unspecified, standing/m3" + }, + "target": { + "name": "Wood, unspecified, standing" + } + }, + { + "source": { + "name": "Particulates, < 2.5 um" + }, + "target": { + "name": "Particulate Matter, < 2.5 um" + } + }, + { + "source": { + "name": "Particulates, > 10 um" + }, + "target": { + "name": "Particulate Matter, > 10 um" + } + }, + { + "source": { + "name": "Particulates, > 2.5 um, and < 10um" + }, + "target": { + "name": "Particulate Matter, > 2.5 um and < 10um" + } + }, + { + "source": { + "name": "Sand" + }, + "target": { + "name": "Sand, unspecified" + } + }, + { + "source": { + "name": "Potassium chloride" + }, + "target": { + "name": "Sylvite" + } + }, + { + "source": { + "name": "Sodium tetrahydroborate" + }, + "target": { + "name": "Sodium tetrahydridoborate" + } + }, + { + "source": { + "name": "Toluene, 2-chloro-" + }, + "target": { + "name": "o-Chlorotoluene" + } + }, + { + "source": { + "name": "Pentane, 2,2,4-trimethyl-" + }, + "target": { + "name": "2,2,4-Trimethylpentane" + } + }, + { + "source": { + "name": "Dioxin, 2,3,7,8 Tetrachlorodibenzo-p-" + }, + "target": { + "name": "Dioxins, measured as 2,3,7,8-tetrachlorodibenzo-p-dioxin" + } + }, + { + "source": { + "name": "Discarded fish, demersal" + }, + "target": { + "name": "Discarded fish, demersal, to ocean" + } + }, + { + "source": { + "name": "Methane, tetrachloro-, CFC-10" + }, + "target": { + "name": "Carbon tetrachloride" + } + }, + { + "source": { + "name": "Methane, tetrafluoro-, CFC-14" + }, + "target": { + "name": "Tetrafluoromethane" + } + }, + { + "source": { + "name": "Metolachlor, (S)" + }, + "target": { + "name": "Metolachlor" + } + }, + { + "source": { + "name": "Methane, chlorofluoro-, HCFC-31" + }, + "target": { + "name": "Chloro-fluoromethane" + } + }, + { + "source": { + "name": "Metam-sodium dihydrate" + }, + "target": { + "name": "Metam-sodium" + } + }, + { + "source": { + "name": "Gas, natural, 36 MJ per m3" + }, + "target": { + "name": "Gas, natural, in ground" + } + }, + { + "source": { + "name": "Gas, mine, off-gas, process, coal mining, 36 MJ per m3" + }, + "target": { + "name": "Gas, mine, off-gas, process, coal mining" + } + }, + { + "source": { + "name": "Discarded fish, pelagic" + }, + "target": { + "name": 
"Discarded fish, pelagic, to ocean" + } + }, + { + "source": { + "name": "Dipropylthiocarbamic acid S-ethyl ester" + }, + "target": { + "name": "EPTC" + } + }, + { + "source": { + "name": "Oxydemeton methyl" + }, + "target": { + "name": "Oxydemeton-methyl" + } + }, + { + "source": { + "name": "Thiazole, 2-(thiocyanatemethylthio)benzo-" + }, + "target": { + "name": "TCMTB" + } + }, + { + "source": { + "name": "Tri-allate" + }, + "target": { + "name": "Triallate" + } + }, + { + "source": { + "name": "Cesium (I)" + }, + "target": { + "name": "Caesium I" + } + }, + { + "source": { + "name": "Cesium" + }, + "target": { + "name": "Caesium" + } + }, + { + "source": { + "name": "Dimethyl formamide" + }, + "target": { + "name": "N,N-Dimethylformamide" + } + }, + { + "source": { + "name": "Methane, land transformation" + }, + "target": { + "name": "Methane, from soil or biomass stock" + } + }, + { + "source": { + "name": "Carbon dioxide, land transformation" + }, + "target": { + "name": "Carbon dioxide, from soil or biomass stock" + } + }, + { + "source": { + "name": "Carbon monoxide, land transformation" + }, + "target": { + "name": "Carbon monoxide, from soil or biomass stock" + } + }, + { + "source": { + "name": "Nitrogen, atmospheric" + }, + "target": { + "name": "Nitrogen" + } + }, + { + "source": { + "name": "Butyric acid, 4-(2,4-dichlorophenoxy)-" + }, + "target": { + "name": "2,4-DB" + } + }, + { + "source": { + "name": "Benzo(a)anthracene" + }, + "target": { + "name": "Benz(a)anthracene" + } + }, + { + "source": { + "name": "Oil, crude, 43.4 MJ per kg" + }, + "target": { + "name": "Oil, crude" + } + }, + { + "source": { + "name": "Argon-40/kg" + }, + "target": { + "name": "Argon" + } + }, + { + "source": { + "name": "1-Butanol" + }, + "target": { + "name": "Butanol" + } + }, + { + "source": { + "name": "Metaldehyde (tetramer)" + }, + "target": { + "name": "Metaldehyde" + } + }, + { + "source": { + "name": "Roundup" + }, + "target": { + "name": "Glyphosate" + } + }, + { + "source": { + "name": "Transformation, from pasture and meadow, organic" + }, + "target": { + "name": "Transformation, from pasture, man made, extensive" + } + }, + { + "source": { + "name": "Transformation, to pasture and meadow, organic" + }, + "target": { + "name": "Transformation, to pasture, man made, extensive" + } + }, + { + "source": { + "name": "Transformation, from arable, organic" + }, + "target": { + "name": "Transformation, from arable land, unspecified use" + } + }, + { + "source": { + "name": "Transformation, to arable, organic" + }, + "target": { + "name": "Transformation, to arable land, unspecified use" + } + }, + { + "source": { + "name": "Transformation, to industrial area, built up" + }, + "target": { + "name": "Transformation, to industrial area", + "context": [ + "natural resource", + "land" + ] + } + }, + { + "source": { + "name": "Transformation, from agriculture" + }, + "target": { + "name": "Transformation, from annual crop" + } + }, + { + "source": { + "name": "Transformation, from annual crop, non-irrigated, fallow" + }, + "target": { + "name": "Transformation, from pasture, man made" + } + }, + { + "source": { + "name": "Transformation, from forest, intensive, clear-cutting" + }, + "target": { + "name": "Transformation, from forest, intensive" + } + }, + { + "source": { + "name": "Transformation, from forest, used" + }, + "target": { + "name": "Transformation, from forest, intensive" + } + }, + { + "source": { + "name": "Transformation, from grassland" + }, + "target": { + "name": 
"Transformation, from grassland, natural (non-use)" + } + }, + { + "source": { + "name": "Transformation, from grassland/pasture/meadow" + }, + "target": { + "name": "Transformation, from pasture, man made" + } + }, + { + "source": { + "name": "Transformation, from industrial area, benthos" + }, + "target": { + "name": "Transformation, from seabed, unspecified" + } + }, + { + "source": { + "name": "Transformation, from industrial area, built up" + }, + "target": { + "name": "Transformation, from industrial area" + } + }, + { + "source": { + "name": "Transformation, from industrial area, vegetation" + }, + "target": { + "name": "Transformation, from industrial area" + } + }, + { + "source": { + "name": "Transformation, from permanent crop, fruit" + }, + "target": { + "name": "Transformation, from permanent crop, irrigated" + } + }, + { + "source": { + "name": "Transformation, from tropical rain forest" + }, + "target": { + "name": "Transformation, from forest, extensive" + } + }, + { + "source": { + "name": "Transformation, from unspecified, used" + }, + "target": { + "name": "Transformation, from unspecified" + } + }, + { + "source": { + "name": "Transformation, to agriculture" + }, + "target": { + "name": "Transformation, to annual crop" + } + }, + { + "source": { + "name": "Transformation, to annual crop, fallow" + }, + "target": { + "name": "Transformation, to arable land, unspecified use" + } + }, + { + "source": { + "name": "Transformation, to annual crop, non-irrigated, fallow" + }, + "target": { + "name": "Transformation, to annual crop, non-irrigated, extensive" + } + }, + { + "source": { + "name": "Transformation, to dump site, benthos" + }, + "target": { + "name": "Transformation, to seabed, unspecified" + } + }, + { + "source": { + "name": "Transformation, to forest, intensive, clear-cutting" + }, + "target": { + "name": "Transformation, to forest, intensive" + } + }, + { + "source": { + "name": "Transformation, to forest, intensive, normal" + }, + "target": { + "name": "Transformation, to forest, intensive" + } + }, + { + "source": { + "name": "Transformation, to forest, intensive, short-cycle" + }, + "target": { + "name": "Transformation, to forest, intensive" + } + }, + { + "source": { + "name": "Transformation, to forest, used" + }, + "target": { + "name": "Transformation, to forest, intensive" + } + }, + { + "source": { + "name": "Transformation, to grassland/pasture/meadow" + }, + "target": { + "name": "Transformation, to pasture, man made" + } + }, + { + "source": { + "name": "Transformation, to industrial area, benthos" + }, + "target": { + "name": "Transformation, to seabed, unspecified" + } + }, + { + "source": { + "name": "Transformation, to industrial area, vegetation" + }, + "target": { + "name": "Transformation, to industrial area" + } + }, + { + "source": { + "name": "Transformation, to permanent crop, fruit, intensive" + }, + "target": { + "name": "Transformation, to permanent crop, irrigated" + } + }, + { + "source": { + "name": "Transformation, to sea and ocean" + }, + "target": { + "name": "Transformation, to seabed, unspecified" + } + }, + { + "source": { + "name": "Transformation, to traffic area, road embankment" + }, + "target": { + "name": "Transformation, to traffic area, road network" + } + }, + { + "source": { + "name": "Transformation, to unspecified, used" + }, + "target": { + "name": "Transformation, to unspecified" + } + }, + { + "source": { + "name": "Transformation, to urban/industrial fallow" + }, + "target": { + "name": "Transformation, to 
industrial area" + } + }, + { + "source": { + "name": "Transformation, to water bodies, artificial" + }, + "target": { + "name": "Transformation, to river, artificial" + } + }, + { + "source": { + "name": "Transformation, to water courses, artificial" + }, + "target": { + "name": "Transformation, to river, artificial" + } + }, + { + "source": { + "name": "Transformation, to lakes, artificial" + }, + "target": { + "name": "Transformation, to lake, artificial" + } + }, + { + "source": { + "name": "Transformation, to rivers, artificial" + }, + "target": { + "name": "Transformation, to river, artificial" + } + }, + { + "source": { + "name": "Occupation, lakes, artificial" + }, + "target": { + "name": "Occupation, lake, artificial" + } + }, + { + "source": { + "name": "Occupation, rivers, artificial" + }, + "target": { + "name": "Occupation, river, artificial" + } + }, + { + "source": { + "name": "Occupation, water bodies, artificial" + }, + "target": { + "name": "Occupation, river, artificial" + } + }, + { + "source": { + "name": "Occupation, agriculture" + }, + "target": { + "name": "Occupation, annual crop" + } + }, + { + "source": { + "name": "Occupation, dump site, benthos" + }, + "target": { + "name": "Occupation, seabed, unspecified" + } + }, + { + "source": { + "name": "Occupation, forest, intensive, normal" + }, + "target": { + "name": "Occupation, forest, intensive" + } + }, + { + "source": { + "name": "Occupation, forest, intensive, short-cycle" + }, + "target": { + "name": "Occupation, forest, intensive" + } + }, + { + "source": { + "name": "Occupation, forest, used" + }, + "target": { + "name": "Occupation, forest, intensive" + } + }, + { + "source": { + "name": "Occupation, grassland/pasture/meadow" + }, + "target": { + "name": "Occupation, pasture, man made" + } + }, + { + "source": { + "name": "Occupation, industrial area, benthos" + }, + "target": { + "name": "Occupation, seabed, unspecified" + } + }, + { + "source": { + "name": "Occupation, industrial area, built up" + }, + "target": { + "name": "Occupation, industrial area" + } + }, + { + "source": { + "name": "Occupation, industrial area, vegetation" + }, + "target": { + "name": "Occupation, industrial area" + } + }, + { + "source": { + "name": "Occupation, permanent crop, fruit, intensive" + }, + "target": { + "name": "Occupation, permanent crop, irrigated" + } + }, + { + "source": { + "name": "Occupation, sea and ocean" + }, + "target": { + "name": "Occupation, seabed, unspecified" + } + }, + { + "source": { + "name": "Occupation, sea and ocean" + }, + "target": { + "name": "Occupation, seabed, unspecified" + } + }, + { + "source": { + "name": "Occupation, traffic area" + }, + "target": { + "name": "Occupation, traffic area, road network" + } + }, + { + "source": { + "name": "Occupation, traffic area, road embankment" + }, + "target": { + "name": "Occupation, traffic area, road network" + } + }, + { + "source": { + "name": "Occupation, unspecified, used" + }, + "target": { + "name": "Occupation, unspecified" + } + }, + { + "source": { + "name": "Occupation, water bodies, artificial" + }, + "target": { + "name": "Occupation, river, artificial" + } + }, + { + "source": { + "name": "Occupation, water courses, artificial" + }, + "target": { + "name": "Occupation, river, artificial" + } + }, + { + "source": { + "name": "Occupation, wetland" + }, + "target": { + "name": "Occupation, inland waterbody, unspecified" + } + }, + { + "source": { + "name": "Bauxite" + }, + "target": { + "name": "Gangue" + } + }, + { + "source": { + 
"name": "Copper ore" + }, + "target": { + "name": "Copper" + } + }, + { + "source": { + "name": "Copper, Cu 0.38%, Au 9.7E-4%, Ag 9.7E-4%, Zn 0.63%, Pb 0.014%, in ore" + }, + "target": { + "name": "Copper" + } + }, + { + "source": { + "name": "Copper, Cu 3.2E+0%, Pt 2.5E-4%, Pd 7.3E-4%, Rh 2.0E-5%, Ni 2.3E+0% in ore" + }, + "target": { + "name": "Copper" + } + }, + { + "source": { + "name": "Copper, Cu 5.2E-2%, Pt 4.8E-4%, Pd 2.0E-4%, Rh 2.4E-5%, Ni 3.7E-2% in ore" + }, + "target": { + "name": "Copper" + } + }, + { + "source": { + "name": "Coal, 18 MJ per kg" + }, + "target": { + "name": "Coal, hard" + } + }, + { + "source": { + "name": "Coal, brown, 10 MJ per kg" + }, + "target": { + "name": "Coal, brown" + } + }, + { + "source": { + "name": "Coal, brown, 8 MJ per kg" + }, + "target": { + "name": "Coal, brown" + } + }, + { + "source": { + "name": "Crude oil" + }, + "target": { + "name": "Oil, crude" + } + }, + { + "source": { + "name": "Energy, from biomass" + }, + "target": { + "name": "Energy, gross calorific value, in biomass" + } + }, + { + "source": { + "name": "Energy, from coal" + }, + "target": { + "name": "Energy, gross calorific value, in biomass" + } + }, + { + "source": { + "name": "Gas, mine, off-gas, process, coal mining/m3" + }, + "target": { + "name": "Gas, mine, off-gas, process, coal mining", + "unit": "Sm3" + } + }, + { + "source": { + "name": "Silver, Ag 9.7E-4%, Au 9.7E-4%, Zn 0.63%, Cu 0.38%, Pb 0.014%, in ore" + }, + "target": { + "name": "Silver" + } + }, + { + "source": { + "name": "Zinc, Zn 0.63%, Au 9.7E-4%, Ag 9.7E-4%, Cu 0.38%, Pb 0.014%, in ore" + }, + "target": { + "name": "Zinc" + } + }, + { + "source": { + "name": "Lead, Pb 0.014%, Au 9.7E-4%, Ag 9.7E-4%, Zn 0.63%, Cu 0.38%, in ore" + }, + "target": { + "name": "Lead" + } + }, + { + "source": { + "name": "Nickel, Ni 2.3E+0%, Pt 2.5E-4%, Pd 7.3E-4%, Rh 2.0E-5%, Cu 3.2E+0% in ore" + }, + "target": { + "name": "Nickel" + } + }, + { + "source": { + "name": "Nickel, Ni 3.7E-2%, Pt 4.8E-4%, Pd 2.0E-4%, Rh 2.4E-5%, Cu 5.2E-2% in ore" + }, + "target": { + "name": "Nickel" + } + }, + { + "source": { + "name": "Platinum, Pt 4.8E-4%, Pd 2.0E-4%, Rh 2.4E-5%, Ni 3.7E-2%, Cu 5.2E-2% in ore" + }, + "target": { + "name": "Platinum" + } + }, + { + "source": { + "name": "Gold, Au 9.7E-4%, Ag 9.7E-4%, Zn 0.63%, Cu 0.38%, Pb 0.014%, in ore" + }, + "target": { + "name": "Gold" + } + }, + { + "source": { + "name": "Platinum, Pt 2.5E-4%, Pd 7.3E-4%, Rh 2.0E-5%, Ni 2.3E+0%, Cu 3.2E+0% in ore" + }, + "target": { + "name": "Platinum" + } + }, + { + "source": { + "name": "Palladium, Pd 2.0E-4%, Pt 4.8E-4%, Rh 2.4E-5%, Ni 3.7E-2%, Cu 5.2E-2% in ore" + }, + "target": { + "name": "Palladium" + } + }, + { + "source": { + "name": "Palladium, Pd 7.3E-4%, Pt 2.5E-4%, Rh 2.0E-5%, Ni 2.3E+0%, Cu 3.2E+0% in ore" + }, + "target": { + "name": "Palladium" + } + }, + { + "source": { + "name": "Clay" + }, + "target": { + "name": "Clay, bentonite" + } + }, + { + "source": { + "name": "Rhodium, Rh 2.0E-5%, Pt 2.5E-4%, Pd 7.3E-4%, Ni 2.3E+0%, Cu 3.2E+0% in ore" + }, + "target": { + "name": "Rhodium" + } + }, + { + "source": { + "name": "Rhodium, Rh 2.4E-5%, Pt 4.8E-4%, Pd 2.0E-4%, Ni 3.7E-2%, Cu 5.2E-2% in ore" + }, + "target": { + "name": "Rhodium" + } + } + ] +} \ No newline at end of file diff --git a/file_logger.py b/file_logger.py new file mode 100644 index 0000000..1025eaa --- /dev/null +++ b/file_logger.py @@ -0,0 +1,337 @@ +import logging +import structlog +from pathlib import Path +from typing import Optional + + +def configure_file_logger( 
+ logger_name: str, + log_file_path: str | Path, + log_level: int = logging.INFO, + log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + encoding: str = "utf-8", +) -> structlog.BoundLogger: + """ + Configure a structlog logger to log messages only to a file. Note that file output requires structlog to be configured with structlog.stdlib.LoggerFactory (as in the accompanying notebook); under structlog's default configuration the returned logger prints to stdout and bypasses these handlers. + + Args: + logger_name: Name of the logger + log_file_path: Path to the log file + log_level: Logging level (default: INFO) + log_format: Format string for log messages + encoding: File encoding (default: utf-8) + + Returns: + structlog.BoundLogger: Configured structlog logger + + Example: + >>> logger = configure_file_logger("my_app", "logs/app.log") + >>> logger.info("Application started") + >>> logger.error("An error occurred", error_code=500) + """ + # Convert path to Path object if it's a string + log_file_path = Path(log_file_path) + + # Create log directory if it doesn't exist + log_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Get the standard library logger + std_logger = logging.getLogger(logger_name) + std_logger.setLevel(log_level) + + # Remove only FileHandler handlers to avoid duplicates while preserving other handlers + for handler in std_logger.handlers[:]: + if isinstance(handler, logging.FileHandler): + std_logger.removeHandler(handler) + + # Create a simple file handler + file_handler = logging.FileHandler( + filename=log_file_path, + encoding=encoding, + ) + + # Create formatter + formatter = logging.Formatter(log_format) + file_handler.setFormatter(formatter) + + # Add handler to logger + std_logger.addHandler(file_handler) + + # Prevent propagation to root logger to avoid console output + std_logger.propagate = False + + # Get the structlog logger + logger = structlog.get_logger(logger_name) + + return logger + + +def configure_structured_file_logger( + logger_name: str, + log_file_path: str | Path, + log_level: int = logging.INFO, + encoding: str = "utf-8", + include_timestamp: bool = True, + include_logger_name: bool = True, + include_level: bool = True, +) -> structlog.BoundLogger: + """ + Configure a structlog logger with structured logging to a file.
+ + Args: + logger_name: Name of the logger + log_file_path: Path to the log file + log_level: Logging level (default: INFO) + encoding: File encoding (default: utf-8) + include_timestamp: Whether to include timestamp in logs (default: True) + include_logger_name: Whether to include logger name in logs (default: True) + include_level: Whether to include log level in logs (default: True) + + Returns: + structlog.BoundLogger: Configured structlog logger with structured logging + + Example: + >>> logger = configure_structured_file_logger("my_app", "logs/app.json") + >>> logger.info("User logged in", user_id=123, ip="192.168.1.1") + """ + import json + + # Convert path to Path object if it's a string + log_file_path = Path(log_file_path) + + # Create log directory if it doesn't exist + log_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Get the standard library logger + std_logger = logging.getLogger(logger_name) + std_logger.setLevel(log_level) + + # Remove only FileHandler handlers to avoid duplicates while preserving other handlers + for handler in std_logger.handlers[:]: + if isinstance(handler, logging.FileHandler): + std_logger.removeHandler(handler) + + # Create a simple file handler + file_handler = logging.FileHandler( + filename=log_file_path, + encoding=encoding, + ) + + # Create JSON formatter for structured logging + class JSONFormatter(logging.Formatter): + def format(self, record): + log_entry = { + "message": record.getMessage(), + } + + if include_timestamp: + log_entry["timestamp"] = self.formatTime(record) + + if include_logger_name: + log_entry["logger"] = record.name + + if include_level: + log_entry["level"] = record.levelname + + # Add any extra fields from structlog + if hasattr(record, "structlog"): + log_entry.update(record.structlog) + + return json.dumps(log_entry) + + formatter = JSONFormatter() + file_handler.setFormatter(formatter) + + # Add handler to logger + std_logger.addHandler(file_handler) + + # Prevent propagation to root logger to avoid console output + std_logger.propagate = False + + # Get the structlog logger + logger = structlog.get_logger(logger_name) + + return logger + + +def get_file_logger( + logger_name: str, + log_file_path: str | Path, + structured: bool = False, + **kwargs +) -> structlog.BoundLogger: + """ + Convenience function to get a file logger with either standard or structured logging. + + Args: + logger_name: Name of the logger + log_file_path: Path to the log file + structured: Whether to use structured (JSON) logging (default: False) + **kwargs: Additional arguments passed to the configuration function + + Returns: + structlog.BoundLogger: Configured structlog logger + + Example: + >>> # Standard logging + >>> logger = get_file_logger("app", "logs/app.log") + >>> + >>> # Structured logging + >>> logger = get_file_logger("app", "logs/app.json", structured=True) + """ + if structured: + return configure_structured_file_logger(logger_name, log_file_path, **kwargs) + else: + return configure_file_logger(logger_name, log_file_path, **kwargs) + + +def reset_logger_to_defaults(logger_name: str) -> structlog.BoundLogger: + """ + Reset a named structlog logger to its default configuration. + + This function removes all custom handlers and resets the logger to use + the default structlog configuration, which typically outputs to console. 
+ + Args: + logger_name: Name of the logger to reset + + Returns: + structlog.BoundLogger: Reset structlog logger + + Example: + >>> # Configure a file logger + >>> logger = configure_file_logger("my_app", "logs/app.log") + >>> logger.info("This goes to file") + >>> + >>> # Reset to defaults (console output) + >>> logger = reset_logger_to_defaults("my_app") + >>> logger.info("This goes to console") + """ + # Get the standard library logger + std_logger = logging.getLogger(logger_name) + + # Remove all existing handlers + for handler in std_logger.handlers[:]: + std_logger.removeHandler(handler) + + # Reset logger level to default (NOTSET) + std_logger.setLevel(logging.NOTSET) + + # Re-enable propagation to parent loggers + std_logger.propagate = True + + # Get the structlog logger (this will use default structlog configuration) + logger = structlog.get_logger(logger_name) + + return logger + + +def reset_all_loggers_to_defaults() -> None: + """ + Reset all loggers to their default configuration. + + This function removes all custom handlers from all loggers and resets + them to use the default structlog configuration. + + Example: + >>> # Configure multiple file loggers + >>> logger1 = configure_file_logger("app1", "logs/app1.log") + >>> logger2 = configure_file_logger("app2", "logs/app2.log") + >>> + >>> # Reset all loggers to defaults + >>> reset_all_loggers_to_defaults() + >>> + >>> # Now all loggers will output to console by default + >>> logger1.info("This goes to console") + >>> logger2.info("This also goes to console") + """ + # Get all existing loggers + loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] + + # Also include the root logger + loggers.append(logging.getLogger()) + + for logger in loggers: + # Remove all existing handlers + for handler in logger.handlers[:]: + logger.removeHandler(handler) + + # Reset logger level to default + logger.setLevel(logging.NOTSET) + + # Re-enable propagation + logger.propagate = True + + +def get_logger_info(logger_name: str) -> dict: + """ + Get information about a logger's current configuration. 
+ + Args: + logger_name: Name of the logger to inspect + + Returns: + dict: Information about the logger's configuration + + Example: + >>> logger = configure_file_logger("my_app", "logs/app.log") + >>> info = get_logger_info("my_app") + >>> print(info) + >>> # Output: {'name': 'my_app', 'level': 20, 'handlers': 1, 'propagate': False} + """ + std_logger = logging.getLogger(logger_name) + + return { + "name": logger_name, + "level": std_logger.level, + "handlers": len(std_logger.handlers), + "propagate": std_logger.propagate, + "handler_types": [type(handler).__name__ for handler in std_logger.handlers], + } + + +# Example usage and testing +if __name__ == "__main__": + # Example 1: Standard file logging + logger1 = configure_file_logger("test_app", "logs/test.log") + logger1.info("This is a test message") + logger1.error("This is an error message", error_code=500) + + # Example 2: Structured file logging + logger2 = configure_structured_file_logger("test_app_structured", "logs/test.json") + logger2.info("User action", user_id=123, action="login", ip="192.168.1.1") + logger2.error("Database error", error_code=500, table="users", query="SELECT *") + + # Example 3: Using convenience function + logger3 = get_file_logger("convenience_app", "logs/convenience.log") + logger3.info("Using convenience function") + + logger4 = get_file_logger("convenience_structured", "logs/convenience.json", structured=True) + logger4.info("Structured logging with convenience", event="test", data={"key": "value"}) + + # Example 4: Demonstrating reset functionality + print("\n=== Testing Reset Functionality ===") + + # Show logger info before reset + print("Before reset:") + print(f"test_app logger info: {get_logger_info('test_app')}") + + # Reset specific logger + reset_logger = reset_logger_to_defaults("test_app") + reset_logger.info("This message goes to console (after reset)") + + # Show logger info after reset + print("After reset:") + print(f"test_app logger info: {get_logger_info('test_app')}") + + # Example 5: Reset all loggers + print("\n=== Resetting All Loggers ===") + reset_all_loggers_to_defaults() + + # All loggers now use default configuration + logger1.info("This also goes to console now") + logger2.info("This also goes to console now") + + print("\nLog files created successfully!") + print("Check the 'logs' directory for the generated log files.") + print("After reset, all loggers output to console by default.") \ No newline at end of file diff --git a/flowmapper/__init__.py b/flowmapper/__init__.py deleted file mode 100644 index c8c0b81..0000000 --- a/flowmapper/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -__all__ = ( - "__version__", - "CASField", - "ContextField", - "Flow", - "Flowmap", - "flowmapper", - "OutputFormat", - "UnitField", -) - -__version__ = "0.4.2" - -from flowmapper.cas import CASField -from flowmapper.context import ContextField -from flowmapper.flow import Flow -from flowmapper.flowmap import Flowmap -from flowmapper.main import OutputFormat, flowmapper -from flowmapper.unit import UnitField diff --git a/flowmapper/cas.py b/flowmapper/cas.py deleted file mode 100644 index b243365..0000000 --- a/flowmapper/cas.py +++ /dev/null @@ -1,67 +0,0 @@ -from functools import cached_property - - -class CASField: - """ - Class for CAS Registry Numbers that accepts padded or non-padded strings - """ - - def __init__(self, cas: str | None): - if not isinstance(cas, str) and cas is not None: - raise TypeError(f"cas should be a str, not {type(cas).__name__}") - else: - self.original = cas - 
self.transformed = ("" if cas is None else cas).strip().lstrip("0").strip() - self.digits = tuple(int(d) for d in self.transformed.replace("-", "")) - - @property - def export(self): - if self.original: - return "{}-{}-{}".format( - "".join([str(x) for x in self.digits[:-3]]), - "".join([str(x) for x in self.digits[-3:-1]]), - self.digits[-1], - ) - else: - return "" - - def __repr__(self): - if not self.original: - return "CASField with missing original value" - else: - return "{} CASField: '{}' -> '{}'".format( - "Valid" if self.valid else "Invalid", self.original, self.export - ) - - def __eq__(self, other): - if isinstance(other, CASField): - return self.original and self.digits == other.digits - if isinstance(other, str): - try: - return self.digits == CASField(other).digits - except (TypeError, ValueError): - return False - return False - - @cached_property - def check_digit_expected(self): - """ - Expected digit acording to https://www.cas.org/support/documentation/chemical-substances/checkdig algorithm - """ - result = ( - sum( - [ - index * value - for index, value in enumerate(self.digits[::-1], start=1) - ] - ) - % 10 - ) - return result - - @property - def valid(self): - """ - True if check if CAS number is valid acording to https://www.cas.org/support/documentation/chemical-substances/checkdig algorithm - """ - return self.digits[-1] == self.check_digit_expected diff --git a/flowmapper/context.py b/flowmapper/context.py deleted file mode 100644 index 941c44f..0000000 --- a/flowmapper/context.py +++ /dev/null @@ -1,81 +0,0 @@ -from collections.abc import Iterable -from typing import Any - -MISSING_VALUES = { - "", - "(unknown)", - "(unspecified)", - "null", - "unknown", - "unspecified", -} - - -class ContextField(Iterable): - def __init__(self, original: Any, transformed: Any = None): - self.original = original - self.transformed = transformed or original - self.normalized = self.normalize(self.transformed) - - def normalize(self, value: Any) -> tuple[str, ...]: - if isinstance(value, (tuple, list)): - intermediate = list(value) - elif isinstance(value, str) and "/" in value: - intermediate = list(value.split("/")) - elif isinstance(value, str): - intermediate = [value] - else: - raise ValueError(f"Can't understand input context {value}") - - intermediate = [elem.lower().strip() for elem in intermediate] - - if intermediate[-1] in MISSING_VALUES: - intermediate = intermediate[:-1] - - return tuple(intermediate) - - def export_as_string(self): - if isinstance(self.original, str): - return self.original - elif isinstance(self.original, (list, tuple)): - return "✂️".join(self.original) - else: - # Only reachable by manually changing `self.original` - raise ValueError("Invalid context data") - - def __iter__(self): - return iter(self.normalized) - - def __eq__(self, other): - if self and other and isinstance(other, ContextField): - return self.original and self.normalized == other.normalized - else: - try: - normalized_other = self.normalize(other) - return (self.normalized == normalized_other) or ( - self.original == normalized_other - ) - except ValueError: - return False - - def __repr__(self): - return f"ContextField: '{self.original}' -> '{self.normalized}'" - - def __bool__(self): - return bool(self.normalized) - - def __hash__(self): - return hash(self.normalized) - - def __contains__(self, other): - """This context is more generic than the `other` context. 
- - ```python - Context("a/b/c") in Context("a/b") - >>> True - ``` - - """ - if not isinstance(other, ContextField): - return False - return self.normalized == other.normalized[: len(self.normalized)] diff --git a/flowmapper/extraction/__init__.py b/flowmapper/extraction/__init__.py deleted file mode 100644 index 05a1d14..0000000 --- a/flowmapper/extraction/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# from .ecoinvent import ecoinvent_biosphere_extractor -from .ecospold2 import ecospold2_biosphere_extractor -from .simapro_csv import simapro_csv_biosphere_extractor -from .simapro_ecospold1 import simapro_ecospold1_biosphere_extractor diff --git a/flowmapper/flow.py b/flowmapper/flow.py deleted file mode 100644 index db40b6f..0000000 --- a/flowmapper/flow.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import List - -from flowmapper.cas import CASField -from flowmapper.context import ContextField -from flowmapper.string_field import StringField -from flowmapper.string_list import StringList -from flowmapper.unit import UnitField -from flowmapper.utils import apply_transformations, generate_flow_id - - -class Flow: - def __init__( - self, - data: dict, - transformations: List[dict] | None = None, - ): - # Hash of sorted dict keys and values - self.id = generate_flow_id(data) - self.data = data - self.transformed = apply_transformations(data, transformations) - self.conversion_factor = self.transformed.get("conversion_factor") - self.identifier = StringField( - original=self.data.get("identifier"), - transformed=self.transformed.get("identifier"), - use_lowercase=False, - ) - self.name = StringField( - original=self.data.get("name"), - transformed=self.transformed.get("name"), - ) - self.unit = UnitField( - original=self.data.get("unit"), - transformed=self.transformed.get("unit"), - ) - self.context = ContextField( - original=self.data.get("context"), - transformed=self.transformed.get("context"), - ) - self.cas = CASField(data.get("CAS number")) - self.synonyms = StringList( - original=self.data.get("synonyms", []), - transformed=self.transformed.get("synonyms", []), - ) - - @property - def uniqueness_id(self): - tupleize = lambda x: tuple(x) if isinstance(x, list) else x - return ( - self.name.original, - tupleize(self.context.original), - self.unit.original, - self.identifier.original, - ) - - @property - def missing(self): - """This flow has been marked as missing in target list""" - return self.transformed.get("__missing__") - - @property - def export(self) -> dict: - return { - k: v - for k, v in [ - ("name", self.name.original), - ("unit", self.unit.original), - ("identifier", self.identifier.original), - ("context", self.context.original), - ("CAS number", self.cas.export), - ] - if v - } - - def __repr__(self) -> str: - return f"""Flow object: - Identifier: {self.identifier} - Name: {self.name} - Context: {self.context} - Unit: {self.unit}""" - - def __eq__(self, other): - return self.id == other.id - - def __hash__(self): - return hash(self.id) - - # Used in sorting - def __lt__(self, other): - return self.name.normalized < other.name.normalized diff --git a/flowmapper/flowmap.py b/flowmapper/flowmap.py deleted file mode 100644 index 37b7fb2..0000000 --- a/flowmapper/flowmap.py +++ /dev/null @@ -1,554 +0,0 @@ -import math -import warnings -from collections import Counter -from functools import cached_property -from numbers import Number -from pathlib import Path -from typing import Callable, Optional, Union - -import pandas as pd -import pint -import randonneur -from tqdm import tqdm 
- -from flowmapper import __version__ -from flowmapper.errors import DifferingConversions, DifferingMatches -from flowmapper.flow import Flow -from flowmapper.match import format_match_result, match_rules -from flowmapper.utils import match_sort_order - - -def source_flow_id(obj: Flow, ensure_id: bool = False) -> str: - return ( - str(obj.identifier.original or "") - if (obj.identifier.original or not ensure_id) - else str(obj.id or "") - ) - - -class Flowmap: - """ - Crosswalk of flows from a source flow list to a target flow list. - - This class provides functionalities to map flows between different flow lists using a series of predefined match rules. - - Attributes - ---------- - source_flows : list[Flow] - The list of (unique) source flows to be mapped. - source_flows_nomatch : list[Flow] - The list of (unique) source flows that do not match any rule. - target_flows : list[Flow] - The list of target flows for mapping. - target_flows_nomatch : list[Flow] - The list of target flows that do not match any rule. - - """ - - def __init__( - self, - source_flows: list[Flow], - target_flows: list[Flow], - rules: list[Callable[..., bool]] = None, - nomatch_rules: list[Callable[..., bool]] = None, - disable_progress: bool = False, - ): - """ - Initializes the Flowmap with source and target flows, along with optional matching rules. - - Duplicated flows are removed from both source and targets lists. - - Parameters - ---------- - source_flows : list[Flow] - The list of source flows to be mapped. - target_flows : list[Flow] - The list of target flows for mapping. - rules : list[Callable[..., bool]], optional - Custom rules for matching source flows to target flows. Default is the set of rules defined in `match_rules`. - nomatch_rules : list[Callable[..., bool]], optional - Rules to identify flows that should not be matched. - disable_progress : bool, optional - If True, progress bar display during the mapping process is disabled. - - """ - self.disable_progress = disable_progress - self.rules = rules if rules else match_rules() - if nomatch_rules: - self.source_flows = [] - self.source_flows_nomatch = [] - - for flow in source_flows: - matched = False - for rule in nomatch_rules: - if rule(flow): - self.source_flows_nomatch.append(flow) - matched = True - break - if not matched: - self.source_flows.append(flow) - self.source_flows = list(dict.fromkeys(self.source_flows)) - self.source_flows_nomatch = list(dict.fromkeys(self.source_flows_nomatch)) - - self.target_flows = [] - self.target_flows_nomatch = [] - - for flow in target_flows: - matched = False - for rule in nomatch_rules: - if rule(flow): - self.target_flows_nomatch.append(flow) - matched = True - break - if not matched: - self.target_flows.append(flow) - self.target_flows = list(dict.fromkeys(self.target_flows)) - self.target_flows_nomatch = list(dict.fromkeys(self.target_flows_nomatch)) - else: - self.source_flows = list(dict.fromkeys(source_flows)) - self.source_flows_nomatch = [] - self.target_flows = list(dict.fromkeys(target_flows)) - self.target_flows_nomatch = [] - - def get_single_match( - self, source: Flow, target_flows: list, rules: list - ) -> Union[dict, None]: - """ - Try to find a single match for `source` in `target_flows` using `rules`. - - Adds to `all_mappings` if found. 
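The `Flowmap` constructor above deduplicates flows with `list(dict.fromkeys(...))`, which keeps the first occurrence of each flow while preserving order (dicts preserve insertion order since Python 3.7, and `Flow` is hashable via its id). A quick illustration with plain strings:

```python
# Order-preserving de-duplication, as used for the source/target flow lists.
flows = ["co2", "ch4", "co2", "n2o"]
assert list(dict.fromkeys(flows)) == ["co2", "ch4", "n2o"]
```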
- """ - - def get_conversion_factor(s: Flow, t: Flow, data: dict) -> float | None: - cf_data = data.get("conversion_factor") - cf_s = s.conversion_factor - if cf_data and cf_s: - return cf_data * cf_s - elif cf_data or cf_s: - return cf_data or cf_s - else: - return s.unit.conversion_factor(t.unit) - - for target in target_flows: - for rule in rules: - is_match = rule(source, target) - if is_match: - try: - return { - "from": source, - "to": target, - "conversion_factor": get_conversion_factor( - source, target, is_match - ), - "match_rule": rule.__name__, - "match_rule_priority": self.rules.index(rule), - "info": is_match, - } - except pint.errors.UndefinedUnitError: - warnings.warng( - f"Pint Units error converting source {source.export} to target {target.export}" - ) - raise - - @cached_property - def mappings(self): - """ - Generates and returns a list of mappings from source flows to target flows based on the defined rules. - - Each mapping includes the source flow, target flow, conversion factor, the rule that determined the match, and additional information. - - A single match using the match rule with highest priority is returned for each source flow. - - Returns - ------- - list[dict] - A list of dictionaries containing the mapping details. - - """ - results = [ - self.get_single_match( - source=source, target_flows=self.target_flows, rules=self.rules - ) - for source in tqdm(self.source_flows, disable=self.disable_progress) - ] - - result, seen_sources, seen_combos = [], set(), {} - for mapping in sorted([elem for elem in results if elem], key=match_sort_order): - from_id = mapping["from"].uniqueness_id - combo_key = (from_id, mapping["to"].uniqueness_id) - if combo_key in seen_combos: - other = seen_combos[combo_key] - if ( - isinstance(other["conversion_factor"], Number) - and isinstance(mapping["conversion_factor"], Number) - and not math.isclose( - other["conversion_factor"], - mapping["conversion_factor"], - 1e-5, - 1e-5, - ) - ): - raise DifferingConversions( - f""" -Found two different conversion factors for the same match from - -{mapping['from']} - -to - -{mapping['to']} - -Conversion factors: - {other['match_rule']}: {other['conversion_factor']} - {mapping['match_rule']}: {mapping['conversion_factor']} -""" - ) - elif not isinstance(other["conversion_factor"], Number) and isinstance( - mapping["conversion_factor"], Number - ): - seen_combos[combo_key] = mapping - elif from_id in seen_sources: - other = next( - value for key, value in seen_combos.items() if key[0] == from_id - ) - raise DifferingMatches( - f""" -{mapping['from']} - -Matched to multiple targets, including: - -Match rule: {mapping['match_rule']}: -{mapping['to']} - -Match rule: {other['match_rule']} -{other['to']} -""" - ) - else: - seen_sources.add(from_id) - seen_combos[combo_key] = mapping - result.append(mapping) - - return result - - @cached_property - def _matched_source_flows_ids(self): - return {map_entry["from"].id for map_entry in self.mappings} - - @cached_property - def _matched_target_flows_ids(self): - return {map_entry["to"].id for map_entry in self.mappings} - - @cached_property - def matched_source(self): - """ - Provides a list of source flows that have been successfully matched to target flows. - - Returns - ------- - list[Flow] - A list of matched source flow objects. 
- - """ - result = [ - flow - for flow in self.source_flows - if flow.id in self._matched_source_flows_ids - ] - return result - - @cached_property - def unmatched_source(self): - """ - Provides a list of source flows that have not been matched to any target flows. - - Returns - ------- - list[Flow] - A list of unmatched source flow objects. - - """ - result = [ - flow - for flow in self.source_flows - if flow.id not in self._matched_source_flows_ids - ] - return result - - @cached_property - def matched_source_statistics(self): - """ - Calculates statistics for matched source flows, including the number of matches and the matching percentage for each context. - - Returns - ------- - pandas.DataFrame - A DataFrame containing matching statistics for source flows. - - """ - matched = Counter([flow.context.value for flow in self.matched_source]) - matched = pd.Series(matched).reset_index() - matched.columns = ["context", "matched"] - - total = Counter([flow.context.value for flow in self.source_flows]) - total = pd.Series(total).reset_index() - total.columns = ["context", "total"] - - df = pd.merge(matched, total, on="context", how="outer") - df = df.fillna(0).astype({"matched": "int", "total": "int"}) - - df["percent"] = df.matched / df.total - result = df.sort_values("percent") - return result - - @cached_property - def matched_target(self): - """ - Provides a list of target flows that have been successfully matched to source flows. - - Returns - ------- - list[Flow] - A list of matched target flow objects. - - """ - result = [ - flow - for flow in self.target_flows - if flow.id in self._matched_target_flows_ids - ] - return result - - @cached_property - def unmatched_target(self): - """ - Provides a list of target flows that have not been matched to any source flows. - - Returns - ------- - list[Flow] - A list of unmatched target flow objects. - - """ - result = [ - flow - for flow in self.target_flows - if flow.id not in self._matched_target_flows_ids - ] - return result - - @cached_property - def matched_target_statistics(self): - """ - Calculates statistics for matched target flows, including the number of matches and the matching percentage for each context. - - Returns - ------- - pandas.DataFrame - A DataFrame containing matching statistics for target flows. - - """ - matched = Counter([flow.context.value for flow in self.matched_target]) - matched = pd.Series(matched).reset_index() - matched.columns = ["context", "matched"] - - total = Counter([flow.context.value for flow in self.target_flows]) - total = pd.Series(total).reset_index() - total.columns = ["context", "total"] - - df = pd.merge(matched, total, on="context", how="outer") - df = df.fillna(0).astype({"matched": "int", "total": "int"}) - - df["percent"] = df.matched / df.total - result = df.sort_values("percent") - return result - - def statistics(self): - """ - Prints out summary statistics for the flow mapping process. - - """ - source_msg = ( - f"{len(self.source_flows)} source flows ({len(self.source_flows_nomatch)} excluded)..." - if self.source_flows_nomatch - else f"{len(self.source_flows)} source flows..." - ) - print(source_msg) - target_msg = ( - f"{len(self.target_flows)} target flows ({len(self.target_flows_nomatch)} excluded)..." - if self.target_flows_nomatch - else f"{len(self.target_flows)} target flows..." - ) - print(target_msg) - print( - f"{len(self.mappings)} mappings ({len(self.matched_source) / len(self.source_flows):.2%} of total)." 
- ) - cardinalities = dict(Counter([x["cardinality"] for x in self._cardinalities])) - print(f"Mappings cardinalities: {str(cardinalities)}") - - @cached_property - def _cardinalities(self): - """ - Calculates and returns the cardinalities of mappings between source and target flows. - - Returns - ------- - list[dict] - A sorted list of dictionaries, each indicating the cardinality relationship between a pair of source and target flows. - - """ - mappings = [ - (mapentry["from"].id, mapentry["to"].id) for mapentry in self.mappings - ] - lhs_counts = Counter([pair[0] for pair in mappings]) - rhs_counts = Counter([pair[1] for pair in mappings]) - - result = [] - - for lhs, rhs in mappings: - lhs_count = lhs_counts[lhs] - rhs_count = rhs_counts[rhs] - if lhs_count == 1 and rhs_count == 1: - result.append({"from": lhs, "to": rhs, "cardinality": "1:1"}) - elif lhs_count == 1 and rhs_count > 1: - result.append({"from": lhs, "to": rhs, "cardinality": "N:1"}) - elif lhs_count > 1 and rhs_count == 1: - result.append({"from": lhs, "to": rhs, "cardinality": "1:N"}) - elif lhs_count > 1 and rhs_count > 1: - result.append({"from": lhs, "to": rhs, "cardinality": "N:M"}) - - return sorted(result, key=lambda x: x["from"]) - - def to_randonneur( - self, - source_id: str, - target_id: str, - contributors: list, - mapping_source: dict, - mapping_target: dict, - version: str = "1.0.0", - licenses: Optional[list] = None, - homepage: Optional[str] = None, - name: Optional[str] = None, - path: Optional[Path] = None, - ) -> randonneur.Datapackage: - """ - Export mappings using randonneur data migration file format. - - Parameters - ---------- - path : Path, optional - If provided export the output file to disk. - - Returns - ------- - randonneur.Datapackage object. - - """ - dp = randonneur.Datapackage( - name=name or f"{source_id}-{target_id}", - source_id=source_id, - target_id=target_id, - description=f"Flowmapper {__version__} elementary flow correspondence from {source_id} to {target_id}", - contributors=contributors, - mapping_source=mapping_source, - mapping_target=mapping_target, - homepage=homepage, - version=version, - licenses=licenses, - ) - - result = [ - format_match_result( - map_entry["from"], - map_entry["to"], - map_entry["conversion_factor"], - map_entry["info"], - ) - for map_entry in self.mappings - ] - - dp.add_data(verb="update", data=result) - - if path is not None: - dp.to_json(path) - return dp - - def to_glad( - self, - path: Optional[Path] = None, - ensure_id: bool = False, - missing_source: bool = False, - ): - """ - Export mappings using GLAD flow mapping format, optionally ensuring each flow has an identifier. - - Formats the mapping results according to Global LCA Data Access (GLAD) network initiative flow mapping format. - - Parameters - ---------- - path : Path, optional - If provided export the output file to disk. - ensure_id : bool, optional - If True, ensures each flow has an identifier, default is False. - - Returns - ------- - pandas.DataFrame - A DataFrame containing the formatted mapping results in GLAD format. 
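The `_cardinalities` property above classifies each mapping by how many times its source and target ids appear across all mappings. A compact standalone sketch of the same labelling rule, using plain string ids:

```python
from collections import Counter

pairs = [("s1", "t1"), ("s2", "t2"), ("s3", "t2")]  # s2 and s3 share target t2
lhs = Counter(s for s, _ in pairs)
rhs = Counter(t for _, t in pairs)

def label(source: str, target: str) -> str:
    # Same rule as the removed _cardinalities property.
    return {(True, True): "1:1", (True, False): "N:1",
            (False, True): "1:N", (False, False): "N:M"}[
        (lhs[source] == 1, rhs[target] == 1)
    ]

assert [label(s, t) for s, t in pairs] == ["1:1", "N:1", "N:1"]
```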
- - """ - data = [] - for map_entry in self.mappings: - data.append( - { - "SourceFlowName": map_entry["from"].name.original, - "SourceFlowUUID": source_flow_id( - map_entry["from"], ensure_id=ensure_id - ), - "SourceFlowContext": map_entry["from"].context.export_as_string(), - "SourceUnit": map_entry["from"].unit.original, - "MatchCondition": "=", - "ConversionFactor": map_entry["conversion_factor"], - "TargetFlowName": map_entry["to"].name.original, - "TargetFlowUUID": map_entry["to"].identifier.original, - "TargetFlowContext": map_entry["to"].context.export_as_string(), - "TargetUnit": map_entry["to"].unit.original, - "MemoMapper": map_entry["info"].get("comment"), - } - ) - - if missing_source: - for flow_obj in self.unmatched_source: - data.append( - { - "SourceFlowName": flow_obj.name.original, - "SourceFlowUUID": source_flow_id(flow_obj, ensure_id=ensure_id), - "SourceFlowContext": flow_obj.context.export_as_string(), - "SourceUnit": flow_obj.unit.original, - } - ) - - result = pd.DataFrame(data) - - if not path: - return result - else: - path = Path(path) - path.parent.mkdir(parents=True, exist_ok=True) - - writer = pd.ExcelWriter( - path, - engine="xlsxwriter", - engine_kwargs={"options": {"strings_to_formulas": False}}, - ) - result.to_excel(writer, sheet_name="Mapping", index=False, na_rep="NaN") - - for column in result: - column_length = max( - result[column].astype(str).map(len).max(), len(column) - ) - col_idx = result.columns.get_loc(column) - writer.sheets["Mapping"].set_column(col_idx, col_idx, column_length) - - writer.close() diff --git a/flowmapper/main.py b/flowmapper/main.py deleted file mode 100644 index e3a4330..0000000 --- a/flowmapper/main.py +++ /dev/null @@ -1,150 +0,0 @@ -import json -import logging -from enum import Enum -from pathlib import Path -from typing import Optional - -from flowmapper.flow import Flow -from flowmapper.flowmap import Flowmap -from flowmapper.transformation_mapping import prepare_transformations -from flowmapper.utils import load_standard_transformations, read_migration_files - -logger = logging.getLogger(__name__) - - -def sorting_function(obj: dict) -> tuple: - return ( - obj.get("name", "ZZZ"), - str(obj.get("context", "ZZZ")), - obj.get("unit", "ZZZ"), - ) - - -class OutputFormat(str, Enum): - all = "all" - glad = "glad" - randonneur = "randonneur" - - -def flowmapper( - source: Path, - target: Path, - mapping_source: dict, - mapping_target: dict, - source_id: str, - target_id: str, - contributors: list, - output_dir: Path, - format: OutputFormat, - version: str = "1.0.0", - default_transformations: bool = True, - transformations: Optional[list[Path | str]] = None, - unmatched_source: bool = True, - unmatched_target: bool = True, - matched_source: bool = False, - matched_target: bool = False, - licenses: Optional[list] = None, - homepage: Optional[str] = None, - name: Optional[str] = None, -) -> Flowmap: - """ - Generate mappings between elementary flows lists - """ - output_dir.mkdir(parents=True, exist_ok=True) - - loaded_transformations = [] - if default_transformations: - loaded_transformations.extend(load_standard_transformations()) - if transformations: - loaded_transformations.extend(read_migration_files(*transformations)) - - prepared_transformations = prepare_transformations(loaded_transformations) - - source_flows = [ - Flow(flow, prepared_transformations) for flow in json.load(open(source)) - ] - source_flows = [flow for flow in source_flows if not flow.missing] - target_flows = [ - Flow(flow, prepared_transformations) 
for flow in json.load(open(target)) - ] - - flowmap = Flowmap(source_flows, target_flows) - flowmap.statistics() - - stem = f"{source.stem}-{target.stem}" - - if matched_source: - with open(output_dir / f"{stem}-matched-source.json", "w") as fs: - json.dump( - sorted( - [flow.export for flow in flowmap.matched_source], - key=sorting_function, - ), - fs, - indent=True, - ) - - if unmatched_source: - with open(output_dir / f"{stem}-unmatched-source.json", "w") as fs: - json.dump( - sorted( - [flow.export for flow in flowmap.unmatched_source], - key=sorting_function, - ), - fs, - indent=True, - ) - - if matched_target: - with open(output_dir / f"{stem}-matched-target.json", "w") as fs: - json.dump( - sorted( - [flow.export for flow in flowmap.matched_target], - key=sorting_function, - ), - fs, - indent=True, - ) - - if unmatched_target: - with open(output_dir / f"{stem}-unmatched-target.json", "w") as fs: - json.dump( - sorted( - [flow.export for flow in flowmap.unmatched_target], - key=sorting_function, - ), - fs, - indent=True, - ) - - if format.value == "randonneur": - flowmap.to_randonneur( - source_id=source_id, - target_id=target_id, - contributors=contributors, - mapping_source=mapping_source, - mapping_target=mapping_target, - version=version, - licenses=licenses, - homepage=homepage, - name=name, - path=output_dir / f"{stem}.json", - ) - elif format.value == "glad": - flowmap.to_glad(output_dir / f"{stem}.xlsx", missing_source=True) - else: - flowmap.to_randonneur( - source_id=source_id, - target_id=target_id, - contributors=contributors, - mapping_source=mapping_source, - mapping_target=mapping_target, - version=version, - licenses=licenses, - homepage=homepage, - name=name, - path=output_dir / f"{stem}.json", - ) - flowmap.to_glad(output_dir / f"{stem}.xlsx", missing_source=True) - - return flowmap diff --git a/flowmapper/match.py b/flowmapper/match.py deleted file mode 100644 index 822b7d4..0000000 --- a/flowmapper/match.py +++ /dev/null @@ -1,245 +0,0 @@ -import logging - -from flowmapper.constants import RESOURCE_PARENT_CATEGORY -from flowmapper.flow import Flow -from flowmapper.utils import ( - ends_with_location, - location_reverser, - names_and_locations, - rm_parentheses_roman_numerals, - rm_roman_numerals_ionic_state, -) -from flowmapper.preferred_synonyms import ( - match_identical_names_in_preferred_synonyms, - match_identical_names_in_synonyms, -) - -logger = logging.getLogger(__name__) - - -def format_match_result(s: Flow, t: Flow, conversion_factor: float, match_info: dict): - return match_info | { - "source": s.export, - "target": t.export, - "conversion_factor": conversion_factor, - } - - -def match_identical_identifier(s: Flow, t: Flow, comment: str = "Identical identifier"): - if s.identifier and (s.identifier == t.identifier): - return {"comment": comment} - - -def match_identical_cas_numbers( - s: Flow, t: Flow, comment: str = "Identical CAS numbers" -): - if (s.cas == t.cas) and (s.context == t.context): - return {"comment": comment} - - -def match_identical_names(s: Flow, t: Flow, comment="Identical names"): - if (s.name == t.name) and (s.context == t.context): - return {"comment": comment} - - -def match_identical_names_without_commas( - s: Flow, t: Flow, comment="Identical names when commas removed" -): - if (s.name.normalized.replace(",", "") == t.name.normalized.replace(",", "")) and ( - s.context == t.context - ): - return {"comment": comment} - - -def match_resources_with_wrong_subcontext(s: Flow, t: Flow): - if ( - s.context.normalized[0].lower() in 
RESOURCE_PARENT_CATEGORY - and t.context.normalized[0].lower() in RESOURCE_PARENT_CATEGORY - and s.name == t.name - ): - return {"comment": "Resources with identical name but wrong subcontext"} - - -def match_identical_names_except_missing_suffix( - s: Flow, t: Flow, suffix: str, comment: str = "Identical names except missing suffix" -) -> dict: - if ( - (f"{s.name.normalized}, {suffix}" == t.name) - or (f"{t.name.normalized}, {suffix}" == s.name) - or (f"{s.name.normalized} {suffix}" == t.name) - or (f"{t.name.normalized} {suffix}" == s.name) - ) and s.context == t.context: - return {"comment": comment} - - -def match_names_with_roman_numerals_in_parentheses( - s: Flow, t: Flow, comment="With/without roman numerals in parentheses" -): - if ( - rm_parentheses_roman_numerals(s.name.normalized) - == rm_parentheses_roman_numerals(t.name.normalized) - and s.context == t.context - ): - return {"comment": comment} - - -def match_custom_names_with_location_codes( - s: Flow, t: Flow, comment="Custom names with location code" -): - """Matching which pulls out location codes but also allows for custom name transformations.""" - match = ends_with_location.search(s.name.normalized) - if match: - location = location_reverser[match.group("code")] - # Don't use replace, it will find e.g. ", fr" in "transformation, from" - name = s.name.normalized[: -len(match.group())] - try: - mapped_name = names_and_locations[name]["target"] - except KeyError: - return - if mapped_name == t.name.normalized and s.context == t.context: - result = {"comment": comment, "location": location} | names_and_locations[ - name - ].get("extra", {}) - if ( - s.name.normalized.startswith("water") - and s.unit.normalized == "cubic_meter" - and t.unit.normalized == "kilogram" - ): - result["conversion_factor"] = 1000 - elif ( - s.name.normalized.startswith("water") - and t.unit.normalized == "cubic_meter" - and s.unit.normalized == "kilogram" - ): - result["conversion_factor"] = 0.001 - return result - - -def match_names_with_location_codes( - s: Flow, t: Flow, comment="Name matching with location code" -): - match = ends_with_location.search(s.name.normalized) - if match: - location = location_reverser[match.group("code")] - name = s.name.normalized.replace(match.group(), "") - if name == t.name.normalized and s.context == t.context: - result = {"comment": comment, "location": location} - if ( - s.name.normalized.startswith("water") - and s.unit.normalized == "cubic_meter" - and t.unit.normalized == "kilogram" - ): - result["conversion_factor"] = 1000.0 - elif ( - s.name.normalized.startswith("water") - and t.unit.normalized == "cubic_meter" - and s.unit.normalized == "kilogram" - ): - result["conversion_factor"] = 0.001 - return result - - -def match_resource_names_with_location_codes_and_parent_context( - s: Flow, t: Flow, comment="Name matching with location code and parent context" -): - """Sometimes we have flows in a parent context,""" - match = ends_with_location.search(s.name.normalized) - if match: - location = location_reverser[match.group("code")] - name = s.name.normalized.replace(match.group(), "") - if ( - name == t.name.normalized - and s.context.normalized[0].lower() in RESOURCE_PARENT_CATEGORY - and t.context.normalized[0].lower() in RESOURCE_PARENT_CATEGORY - ): - result = {"comment": comment, "location": location} - if ( - s.name.normalized.startswith("water") - and s.unit.normalized == "cubic_meter" - and t.unit.normalized == "kilogram" - ): - result["conversion_factor"] = 1000.0 - elif ( - 
s.name.normalized.startswith("water") - and t.unit.normalized == "cubic_meter" - and s.unit.normalized == "kilogram" - ): - result["conversion_factor"] = 0.001 - return result - - -def match_non_ionic_state( - s: Flow, t: Flow, comment="Non-ionic state if no better match" -): - if ( - (rm_roman_numerals_ionic_state(s.name.normalized) == t.name) - or (rm_roman_numerals_ionic_state(s.name.normalized) + ", ion" == t.name) - ) and s.context == t.context: - return {"comment": comment} - - -def match_biogenic_to_non_fossil( - s: Flow, t: Flow, comment="Biogenic to non-fossil if no better match" -): - if ( - s.name.normalized.removesuffix(", biogenic") - == t.name.normalized.removesuffix(", non-fossil") - and s.context == t.context - ): - return {"comment": comment} - - -def match_resources_with_suffix_in_ground(s: Flow, t: Flow): - return match_identical_names_except_missing_suffix( - s, t, suffix="in ground", comment="Resources with suffix in ground" - ) - - -def match_flows_with_suffix_unspecified_origin(s: Flow, t: Flow): - return match_identical_names_except_missing_suffix( - s, - t, - suffix="unspecified origin", - comment="Flows with suffix unspecified origin", - ) - - -def match_resources_with_suffix_in_water(s: Flow, t: Flow): - return match_identical_names_except_missing_suffix( - s, t, suffix="in water", comment="Resources with suffix in water" - ) - - -def match_resources_with_suffix_in_air(s: Flow, t: Flow): - return match_identical_names_except_missing_suffix( - s, t, suffix="in air", comment="Resources with suffix in air" - ) - - -def match_emissions_with_suffix_ion(s: Flow, t: Flow): - return match_identical_names_except_missing_suffix( - s, t, suffix="ion", comment="Match emissions with suffix ion" - ) - - -def match_rules(): - return [ - match_identical_identifier, - match_identical_names, - match_identical_names_without_commas, - match_resources_with_suffix_in_ground, - match_resources_with_suffix_in_water, - match_resources_with_suffix_in_air, - match_flows_with_suffix_unspecified_origin, - match_resources_with_wrong_subcontext, - match_emissions_with_suffix_ion, - match_names_with_roman_numerals_in_parentheses, - match_names_with_location_codes, - match_resource_names_with_location_codes_and_parent_context, - match_custom_names_with_location_codes, - match_identical_cas_numbers, - match_non_ionic_state, - match_biogenic_to_non_fossil, - match_identical_names_in_preferred_synonyms, - match_identical_names_in_synonyms, - ] diff --git a/flowmapper/string_field.py b/flowmapper/string_field.py deleted file mode 100644 index c607eea..0000000 --- a/flowmapper/string_field.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Any, Generic, TypeVar - -from flowmapper.utils import normalize_str - -SF = TypeVar("SF") - - -class StringField(Generic[SF]): - def __init__( - self, - original: str | None, - transformed: str | None = None, - use_lowercase: bool = True, - ): - self.original = original - self.normalized = normalize_str(transformed or original) - self.use_lowercase = use_lowercase - if self.use_lowercase: - self.normalized = self.normalized.lower() - - def __eq__(self, other: Any) -> bool: - if self.normalized == "": - return False - elif isinstance(other, StringField): - return ( - self.normalized == other.normalized or self.original == other.original - ) - elif isinstance(other, str): - if self.use_lowercase: - return self.normalized == other.lower() - else: - return self.normalized == other - else: - return False - - def __bool__(self) -> bool: - return bool(self.original) - 
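`StringField` above normalizes before comparing: NFC Unicode normalization, whitespace stripping, and (by default) lowercasing. The effect in isolation, as a minimal sketch:

```python
import unicodedata

def normalize(s: str) -> str:
    # NFC-normalize, strip, lowercase: the default StringField pipeline.
    return unicodedata.normalize("NFC", s).strip().lower()

assert normalize("  Carbon dioxide ") == normalize("carbon DIOXIDE")
# Note: an empty normalized value never compares equal in StringField.__eq__.
```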
- def __repr__(self) -> str: - if not self.original: - return "StringField with missing original value" - else: - return f"StringField: '{self.original}' -> '{self.normalized}'" diff --git a/flowmapper/string_list.py b/flowmapper/string_list.py deleted file mode 100644 index e76c021..0000000 --- a/flowmapper/string_list.py +++ /dev/null @@ -1,34 +0,0 @@ -from collections.abc import Collection, Iterable -from typing import Any, List - -from flowmapper.string_field import StringField - - -class StringList(Collection): - def __init__(self, original: List[str], transformed: List[str] | None = None): - transformed = transformed or original - if original is None: - self.data = [] - else: - self.data = [ - StringField(original=a, transformed=b) - for a, b in zip(original, transformed) - ] - - def __contains__(self, obj: Any) -> bool: - return any(obj == elem for elem in self.data) - - def __iter__(self) -> Iterable: - yield from self.data - - def __len__(self) -> int: - return len(self.data) - - def __bool__(self) -> bool: - return bool(self.data) - - def __repr__(self): - if self: - return "StringList: {}".format([repr(o) for o in self.data]) - else: - return "StringList: Empty" diff --git a/flowmapper/transformation_mapping.py b/flowmapper/transformation_mapping.py deleted file mode 100644 index 9b0c1a9..0000000 --- a/flowmapper/transformation_mapping.py +++ /dev/null @@ -1,50 +0,0 @@ -from collections import UserDict -from functools import partial -from typing import Any, List - -from flowmapper.context import ContextField -from flowmapper.string_field import StringField -from flowmapper.unit import UnitField - -ATTRIBUTE_MAPPING = { - "unit": partial(UnitField, use_lowercase=True), - "context": ContextField, - "identifier": partial(StringField, use_lowercase=True), -} - - -class ComparableFlowMapping(UserDict): - def __init__(self, initialdata: dict): - self.data = { - key: ATTRIBUTE_MAPPING.get(key, StringField)(value) - for key, value in initialdata.items() - } - - def __setitem__(self, key: Any, value: Any) -> None: - self.data[key] = ATTRIBUTE_MAPPING.get(key, StringField)(value) - - def __eq__(self, other: Any) -> bool: - return all(value == other.get(key) for key, value in self.data.items() if value) - - -def prepare_transformations(transformations: List[dict] | None) -> List[dict]: - if not transformations: - return [] - - prepared_transformations = [] - - for transformation_dataset in transformations: - for transformation_mapping in transformation_dataset.get("update", []): - transformation_mapping["source"] = ComparableFlowMapping( - transformation_mapping["source"] - ) - for other_dataset in prepared_transformations: - for other_mapping in other_dataset.get("update", []): - if other_mapping["source"] == transformation_mapping["source"]: - for key, value in other_mapping["target"].items(): - transformation_mapping["source"][key] = value - break - - prepared_transformations.append(transformation_dataset) - - return prepared_transformations diff --git a/flowmapper/unit.py b/flowmapper/unit.py deleted file mode 100644 index ca9ddf9..0000000 --- a/flowmapper/unit.py +++ /dev/null @@ -1,82 +0,0 @@ -import importlib.resources as resource -import math -from typing import Any, Generic, TypeVar - -from pint import UnitRegistry, errors - -from flowmapper.constants import PINT_MAPPING -from flowmapper.utils import normalize_str - -ureg = UnitRegistry() - -with resource.as_file(resource.files("flowmapper") / "data" / "units.txt") as filepath: - ureg.load_definitions(filepath) - -U = TypeVar("U") 
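`ComparableFlowMapping.__eq__` above implements partial matching: a transformation's `source` matches a flow when every field the source sets agrees, while fields it omits are ignored. The same idea with plain dicts (the deleted `matcher` in `utils.py` below is essentially this, minus the falsy-value filter):

```python
# Plain-dict sketch of the partial-match rule used for transformation sources.
def source_matches(source: dict, flow: dict) -> bool:
    return all(flow.get(key) == value for key, value in source.items() if value)

flow = {"name": "carbon dioxide", "context": "air", "unit": "kg"}
assert source_matches({"name": "carbon dioxide"}, flow)  # subset of fields
assert not source_matches({"name": "methane"}, flow)     # field disagrees
```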
-
-
-class UnitField(Generic[U]):
-    def __init__(
-        self, original: str, transformed: str | None = None, use_lowercase: bool = False
-    ):
-        if transformed is None:
-            transformed = original
-        self.original = original
-        if self.is_uri(transformed):
-            # Private attribute, could change in future
-            self._glossary_entry = self.resolve_uri(transformed)
-            self.normalized = normalize_str(self._glossary_entry["label"])
-        else:
-            self.normalized = normalize_str(transformed)
-
-        self.use_lowercase = use_lowercase
-        if self.use_lowercase:
-            self.normalized = self.normalized.lower()
-
-        # Private attribute, could change in future
-        self._pint_compatible = PINT_MAPPING.get(self.normalized, self.normalized)
-
-    def is_uri(self, value: str) -> bool:
-        # Placeholder for when we support glossary entries
-        return False
-
-    def resolve_uri(self, uri: str) -> None:
-        # Placeholder
-        pass
-
-    def __repr__(self) -> str:
-        return f"UnitField: '{self.original}' -> '{self.normalized}'"
-
-    def __bool__(self) -> bool:
-        return bool(self.original)
-
-    def __eq__(self, other: Any):
-        if isinstance(other, UnitField):
-            return (
-                self.normalized == other.normalized
-                or self.conversion_factor(other) == 1
-            )
-        elif isinstance(other, str) and self.use_lowercase:
-            return self.normalized == other.lower()
-        elif isinstance(other, str):
-            return self.normalized == other
-        else:
-            return False
-
-    def compatible(self, other: Any):
-        if not isinstance(other, UnitField):
-            return False
-        else:
-            return math.isfinite(self.conversion_factor(other))
-
-    def conversion_factor(self, to: U | Any) -> float:
-        if self.normalized == to.normalized:
-            result = 1.0
-        else:
-            try:
-                result = (
-                    ureg(self._pint_compatible).to(ureg(to._pint_compatible)).magnitude
-                )
-            except (errors.DimensionalityError, errors.UndefinedUnitError):
-                result = float("nan")
-        return result
diff --git a/flowmapper/utils.py b/flowmapper/utils.py
deleted file mode 100644
index 9432041..0000000
--- a/flowmapper/utils.py
+++ /dev/null
@@ -1,167 +0,0 @@
-import copy
-import hashlib
-import importlib.resources as resource
-import json
-import re
-import unicodedata
-from collections.abc import Collection, Mapping
-from pathlib import Path
-from typing import Any, List, Union
-
-RESULTS_DIR = Path(__file__).parent / "manual_matching" / "results"
-
-with resource.as_file(
-    resource.files("flowmapper") / "data" / "places.json"
-) as filepath:
-    places = json.load(open(filepath))
-
-ends_with_location = re.compile(
-    ",[ \t\r\f]+(?P<code>{})$".format(
-        "|".join([re.escape(string) for string in places])
-    ),
-    re.IGNORECASE,
-)
-# All solutions I found for returning original string instead of
-# lower case one were very ugly
-location_reverser = {obj.lower(): obj for obj in places}
-if len(location_reverser) != len(places):
-    raise ValueError("Multiple possible locations after lower case conversion")
-
-us_lci_ends_with_location = re.compile(
-    "/(?P<code>{})$".format(
-        "|".join(
-            [
-                re.escape(string)
-                for string in places
-                if 2 <= len(string) <= 3 and string.upper() == string
-            ]
-        )
-    ),
-)
-
-with resource.as_file(
-    resource.files("flowmapper") / "data" / "names_and_locations.json"
-) as filepath:
-    names_and_locations = {o["source"]: o for o in json.load(open(filepath))}
-
-
-def load_standard_transformations() -> List:
-    # with resource.as_file(
-    #     resource.files("flowmapper") / "data" / "standard-units-harmonization.json"
-    # ) as filepath:
-    #     units = json.load(open(filepath))
-    with resource.as_file(
-        resource.files("flowmapper") / "data" / "simapro-2023-ecoinvent-3-contexts.json"
-    ) as filepath:
-        contexts = json.load(open(filepath))
-    # return [units, contexts]
-    return [contexts]
-
-
-def generate_flow_id(flow: dict):
-    flow_str = json.dumps(flow, sort_keys=True)
-    result = hashlib.md5(flow_str.encode("utf-8")).hexdigest()
-    return result
-
-
-def read_migration_files(*filepaths: Union[str, Path]) -> List[dict]:
-    """
-    Read and aggregate migration data from multiple JSON files.
-
-    This function opens and reads a series of JSON files, each containing a
-    migration dataset. Files found in `RESULTS_DIR` can be referenced by
-    filename alone.
-
-    Parameters
-    ----------
-    *filepaths : Union[str, Path]
-        Variable length argument list of file paths.
-
-    Returns
-    -------
-    List[dict]
-        A list with the parsed contents of each JSON file read.
-    """
-    migration_data = []
-
-    for filepath in filepaths:
-        if (RESULTS_DIR / filepath).is_file():
-            filepath = RESULTS_DIR / filepath
-        with open(Path(filepath), "r") as fs:
-            migration_data.append(json.load(fs))
-
-    return migration_data
-
-
-def rm_parentheses_roman_numerals(s: str):
-    pattern = r"\(\s*([ivxlcdm]+)\s*\)"
-    return re.sub(pattern, r"\1", s)
-
-
-def rm_roman_numerals_ionic_state(s: str):
-    pattern = r"\s*\(\s*[ivxlcdm]+\s*\)$"
-    return re.sub(pattern, "", s)
-
-
-def normalize_str(s):
-    if s is not None:
-        return unicodedata.normalize("NFC", s).strip()
-    else:
-        return ""
-
-
-def transform_flow(flow, transformation):
-    result = copy.copy(flow)
-    result.update(transformation["target"])
-    return result
-
-
-def matcher(source, target):
-    return all(target.get(key) == value for key, value in source.items())
-
-
-def rowercase(obj: Any) -> Any:
-    """Recursively transform everything to lower case"""
-    if isinstance(obj, str):
-        return obj.lower()
-    elif isinstance(obj, Mapping):
-        return type(obj)([(rowercase(k), rowercase(v)) for k, v in obj.items()])
-    elif isinstance(obj, Collection):
-        return type(obj)([rowercase(o) for o in obj])
-    else:
-        return obj
-
-
-def match_sort_order(obj: dict) -> tuple:
-    return (
-        not obj["from"].name,
-        obj["from"].name.normalized,
-        not obj["from"].context,
-        obj["from"].context.export_as_string(),
-    )
-
-
-def apply_transformations(obj: dict, transformations: List[dict] | None) -> dict:
-    if not transformations:
-        return obj
-    obj = copy.deepcopy(obj)
-    lower = rowercase(obj)
-
-    for dataset in transformations:
-        for transformation_obj in dataset.get("create", []):
-            if matcher(
-                transformation_obj,
-                lower if dataset.get("case-insensitive") else obj,
-            ):
-                # Marked as needing to be created; missing in target list
-                obj["__missing__"] = True
-                break
-        for transformation_obj in dataset.get("update", []):
-            if transformation_obj["source"] == obj:
-                obj.update(transformation_obj["target"])
-                if "conversion_factor" in transformation_obj:
-                    obj["conversion_factor"] = transformation_obj["conversion_factor"]
-                break
-
-    return obj
diff --git a/pyproject.toml b/pyproject.toml
index 2fb6a6d..7c64211 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,23 +20,26 @@ classifiers = [
     "Development Status :: 4 - Beta",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
-    "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language ::
Python :: 3.13", + "Programming Language :: Python :: 3.14", "Natural Language :: English", "Operating System :: OS Independent", "Topic :: Scientific/Engineering" ] -requires-python = ">=3.10" +requires-python = ">=3.11" dependencies = [ "bw_simapro_csv", "pandas[excel]", "pint", "pydantic", "pyecospold", - "randonneur>=0.6", - "randonneur_data", + "randonneur>=0.7.1", + "randonneur_data>=0.7.2", + "RapidFuzz", + "roman", + "structlog", "tqdm", "typer", "xmltodict", @@ -60,6 +63,7 @@ testing = [ dev = [ "build", "pre-commit", + "pyinstrument", "pylint", "pytest", "pytest-cov", @@ -72,7 +76,9 @@ flowmapper = "flowmapper.cli:app" [tool.setuptools] license-files = ["LICENSE"] include-package-data = true -packages = ["flowmapper", "flowmapper.extraction", "flowmapper.manual_matching"] + +[tool.setuptools.packages.find] +where = ["src"] [tool.setuptools.dynamic] version = {attr = "flowmapper.__version__"} @@ -87,7 +93,7 @@ norecursedirs = [ "build", ".tox" ] -testpaths = ["tests/*.py"] +testpaths = ["tests/**/*.py"] [tool.flake8] # Some sane defaults for the code style checker flake8 diff --git a/src/flowmapper/__init__.py b/src/flowmapper/__init__.py new file mode 100644 index 0000000..70a878b --- /dev/null +++ b/src/flowmapper/__init__.py @@ -0,0 +1,23 @@ +__all__ = ( + "__version__", + "CASField", + "ContextField", + "Flow", + "Flowmap", + "flowmapper", + "Match", + "MatchCondition", + "NormalizedFlow", + "UnitField", +) + +__version__ = "0.4.2" + +from flowmapper.domain.flow import Flow +from flowmapper.domain.match import Match +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.fields import CASField, ContextField +from flowmapper.flowmap import Flowmap +from flowmapper.main import flowmapper +from flowmapper.unit import UnitField diff --git a/flowmapper/cli.py b/src/flowmapper/cli.py similarity index 57% rename from flowmapper/cli.py rename to src/flowmapper/cli.py index 8130edf..9775170 100644 --- a/flowmapper/cli.py +++ b/src/flowmapper/cli.py @@ -1,15 +1,23 @@ import importlib.metadata -import logging from pathlib import Path -from typing import Optional +from typing import Annotated +import structlog import typer -from typing_extensions import Annotated -from .extraction import ecospold2_biosphere_extractor, simapro_csv_biosphere_extractor -from .main import OutputFormat, flowmapper +from flowmapper.extraction import ( + ecospold2_biosphere_extractor, + simapro_csv_biosphere_extractor, +) +from flowmapper.main import flowmapper -logger = logging.getLogger(__name__) +try: + from pyinstrument import Profiler +except ImportError: + Profiler = None + + +logger = structlog.get_logger("flowmapper") app = typer.Typer() @@ -23,7 +31,7 @@ def version_callback(value: bool): @app.callback() def main( version: Annotated[ - Optional[bool], + bool | None, typer.Option("--version", callback=version_callback, is_eager=True), ] = None, ): @@ -34,20 +42,16 @@ def main( @app.command() def map( - source: Annotated[Path, typer.Argument(help="Path to source flowlist")], - target: Annotated[Path, typer.Argument(help="Path to target flowlist")], + source: Annotated[Path, typer.Argument(help="Path to source flow list")], + target: Annotated[Path, typer.Argument(help="Path to target flow list")], output_dir: Annotated[ Path, typer.Option(help="Directory to save mapping and diagnostics files") ] = Path("."), - format: Annotated[ - OutputFormat, - typer.Option(help="Mapping file output format", case_sensitive=False), - ] = 
"all", default_transformations: Annotated[ bool, typer.Option(help="Include default context and unit transformations?") ] = True, transformations: Annotated[ - Optional[list[Path]], + list[Path] | None, typer.Option( "--transformations", "-t", @@ -70,12 +74,45 @@ def map( bool, typer.Option(help="Write original target matched flows into separate file?"), ] = False, + profile: Annotated[ + bool, + typer.Option(help="Profile matching code with pyinstrument"), + ] = False, ): - return flowmapper( + # Default generic mapping for JSON flow lists + generic_mapping = { + "expression language": "JSONPath", + "labels": { + "name": "name", + "context": "context", + "unit": "unit", + "identifier": "identifier", + "cas_number": "cas_number", + "location": "location", + }, + } + + if profile: + if Profiler is None: + raise ImportError("`pyinstrument` not installed") + profiler = Profiler(interval=0.01) + profiler.start() + + result = flowmapper( source=source, target=target, + mapping_source=generic_mapping, + mapping_target=generic_mapping, + source_id=source.stem, + target_id=target.stem, + contributors=[ + { + "title": "flowmapper", + "roles": ["author"], + "path": "https://github.com/cmutel/flowmapper", + } + ], output_dir=output_dir, - format=format, default_transformations=default_transformations, transformations=transformations, unmatched_source=unmatched_source, @@ -84,17 +121,24 @@ def map( matched_target=matched_target, ) + if profile: + profiler.stop() + with open(f"{source.stem}-{target.stem}.html", "w") as f: + f.write(profiler.output_html()) + + return result + @app.command() def extract_simapro_csv( simapro_csv_filepath: Annotated[ - Path, typer.Argument(help="Path to source SimaPro CSV file") + Path, typer.Argument(help="Path to SimaPro CSV input file") ], - output_dir: Annotated[ - Path, typer.Argument(help="Directory to save mapping and diagnostics files") + output_filepath: Annotated[ + Path, typer.Argument(help="File path for JSON results data") ], ) -> None: - simapro_csv_biosphere_extractor(simapro_csv_filepath, output_dir) + simapro_csv_biosphere_extractor(simapro_csv_filepath, output_filepath) @app.command() @@ -102,8 +146,8 @@ def extract_ecospold2( elementary_exchanges_filepath: Annotated[ Path, typer.Argument(help="Path to source `ElementaryExchanges.xml` file") ], - output_dir: Annotated[ - Path, typer.Argument(help="Directory to save mapping and diagnostics files") + output_filepath: Annotated[ + Path, typer.Argument(help="File path for JSON results data") ], ) -> None: - ecospold2_biosphere_extractor(elementary_exchanges_filepath, output_dir) + ecospold2_biosphere_extractor(elementary_exchanges_filepath, output_filepath) diff --git a/flowmapper/constants.py b/src/flowmapper/constants.py similarity index 53% rename from flowmapper/constants.py rename to src/flowmapper/constants.py index 4499e9a..591c5c8 100644 --- a/flowmapper/constants.py +++ b/src/flowmapper/constants.py @@ -1,11 +1,11 @@ -PINT_MAPPING = { - "livestock unit": "livestock_unit", - "kilowatt hour": "kilowatt_hour", -} - RESOURCE_PARENT_CATEGORY = { "natural resources", "natural resource", "resources", "resource", + "land use", + "economic", + "social", + "raw materials", + "raw", } diff --git a/flowmapper/data/manual_name_match_simapro_ecoinvent_3.8.json b/src/flowmapper/data/manual_name_match_simapro_ecoinvent_3.8.json similarity index 100% rename from flowmapper/data/manual_name_match_simapro_ecoinvent_3.8.json rename to src/flowmapper/data/manual_name_match_simapro_ecoinvent_3.8.json diff --git 
a/flowmapper/data/manual_name_match_simapro_ecoinvent_3.9.json b/src/flowmapper/data/manual_name_match_simapro_ecoinvent_3.9.json similarity index 100% rename from flowmapper/data/manual_name_match_simapro_ecoinvent_3.9.json rename to src/flowmapper/data/manual_name_match_simapro_ecoinvent_3.9.json diff --git a/flowmapper/data/names_and_locations.json b/src/flowmapper/data/names_and_locations.json similarity index 100% rename from flowmapper/data/names_and_locations.json rename to src/flowmapper/data/names_and_locations.json diff --git a/flowmapper/data/places.json b/src/flowmapper/data/places.json similarity index 100% rename from flowmapper/data/places.json rename to src/flowmapper/data/places.json diff --git a/flowmapper/data/simapro-2023-ecoinvent-3-contexts.json b/src/flowmapper/data/simapro-2025-ecoinvent-3-contexts.json similarity index 51% rename from flowmapper/data/simapro-2023-ecoinvent-3-contexts.json rename to src/flowmapper/data/simapro-2025-ecoinvent-3-contexts.json index cd2a7d7..738c131 100644 --- a/flowmapper/data/simapro-2023-ecoinvent-3-contexts.json +++ b/src/flowmapper/data/simapro-2025-ecoinvent-3-contexts.json @@ -1,5 +1,5 @@ { - "name": "SimaPro-ecoinvent-3-context", + "name": "SimaPro-2025-ecoinvent-3.12-context", "licenses": [ { "name": "CC BY 4.0", @@ -8,17 +8,45 @@ } ], "version": "1.0.0", - "description": "Context mapping from 2023 SimaPro to ecoinvent 3", - "created": "2024-04-12T09:29:02.823409", + "description": "Context mapping from 2025 SimaPro to ecoinvent 3.12", "case-insensitive": true, + "created": "2025-11-10T12:34:56Z", "contributors": [ { "title": "Chris Mutel", "path": "https://chris.mutel.org/", - "role": "author" + "roles": ["author"] } ], + "graph_context": [ + "nodes" + ], + "mapping": { + "source": { + "expression language": "JSONPath", + "labels": { + "context": "$.context" + } + }, + "target": { + "expression language": "JSONPath", + "labels": { + "context": "$.context" + } + } + }, "update": [ + { + "source": { + "context": "air/unspecified" + }, + "target": { + "context": [ + "air", + "unspecified" + ] + } + }, { "source": { "context": "air/high. pop." @@ -76,17 +104,62 @@ }, { "source": { - "context": "emissions to air" + "context": "Emissions to air" + }, + "target": { + "context": [ + "air", + "unspecified" + ] + } + }, + { + "source": { + "context": "Emissions to air/" + }, + "target": { + "context": [ + "air", + "unspecified" + ] + } + }, + { + "source": { + "context": "Emissions to air/unspecified" + }, + "target": { + "context": [ + "air", + "unspecified" + ] + } + }, + { + "source": { + "context": "Airborne emissions/(unspecified)" + }, + "target": { + "context": [ + "air", + "unspecified" + ] + } + }, + { + "source": { + "context": "Emissions to air/high. pop." }, "target": { "context": [ - "air" + "air", + "urban air close to ground" ] } }, { "source": { - "context": "emissions to air/high. pop." + "context": "Airborne emissions/high. pop." }, "target": { "context": [ @@ -97,7 +170,40 @@ }, { "source": { - "context": "emissions to air/low. pop." + "context": "Emissions to air/low. pop." + }, + "target": { + "context": [ + "air", + "non-urban air or from high stacks" + ] + } + }, + { + "source": { + "context": "Airborne emissions/low. pop." 
+ }, + "target": { + "context": [ + "air", + "non-urban air or from high stacks" + ] + } + }, + { + "source": { + "context": "Emissions to air/indoor" + }, + "target": { + "context": [ + "air", + "non-urban air or from high stacks" + ] + } + }, + { + "source": { + "context": "Airborne emissions/indoor" }, "target": { "context": [ @@ -108,7 +214,18 @@ }, { "source": { - "context": "emissions to air/low. pop., long-term" + "context": "Emissions to air/low. pop., long-term" + }, + "target": { + "context": [ + "air", + "low population density, long-term" + ] + } + }, + { + "source": { + "context": "Airborne emissions/low. pop., long-term" }, "target": { "context": [ @@ -119,7 +236,7 @@ }, { "source": { - "context": "emissions to air/stratosphere + troposphere" + "context": "Emissions to air/stratosphere + troposphere" }, "target": { "context": [ @@ -130,17 +247,62 @@ }, { "source": { - "context": "emissions to soil" + "context": "Airborne emissions/stratosphere + troposphere" + }, + "target": { + "context": [ + "air", + "lower stratosphere + upper troposphere" + ] + } + }, + { + "source": { + "context": "Emissions to soil/unspecified" + }, + "target": { + "context": [ + "soil", + "unspecified" + ] + } + }, + { + "source": { + "context": "Emissions to soil/(unspecified)" + }, + "target": { + "context": [ + "soil", + "unspecified" + ] + } + }, + { + "source": { + "context": "Emissions to soil" }, "target": { "context": [ - "soil" + "soil", + "unspecified" ] } }, { "source": { - "context": "emissions to soil/agricultural" + "context": "Emissions to soil/" + }, + "target": { + "context": [ + "soil", + "unspecified" + ] + } + }, + { + "source": { + "context": "Emissions to soil/agricultural" }, "target": { "context": [ @@ -151,7 +313,7 @@ }, { "source": { - "context": "emissions to soil/forestry" + "context": "Emissions to soil/forestry" }, "target": { "context": [ @@ -162,7 +324,7 @@ }, { "source": { - "context": "emissions to soil/industrial" + "context": "Emissions to soil/industrial" }, "target": { "context": [ @@ -171,6 +333,39 @@ ] } }, + { + "source": { + "context": "Final waste flows/(unspecified)" + }, + "target": { + "context": [ + "inventory indicator", + "waste" + ] + } + }, + { + "source": { + "context": "Final waste flows/" + }, + "target": { + "context": [ + "inventory indicator", + "waste" + ] + } + }, + { + "source": { + "context": "water/unspecified" + }, + "target": { + "context": [ + "water", + "unspecified" + ] + } + }, { "source": { "context": "water/groundwater" @@ -221,7 +416,41 @@ }, "target": { "context": [ - "water" + "water", + "unspecified" + ] + } + }, + { + "source": { + "context": "emissions to water/" + }, + "target": { + "context": [ + "water", + "unspecified" + ] + } + }, + { + "source": { + "context": "emissions to water/unspecified" + }, + "target": { + "context": [ + "water", + "unspecified" + ] + } + }, + { + "source": { + "context": "Waterborne emissions/(unspecified)" + }, + "target": { + "context": [ + "water", + "unspecified" ] } }, @@ -247,6 +476,17 @@ ] } }, + { + "source": { + "context": "Waterborne emissions/groundwater, long-term" + }, + "target": { + "context": [ + "water", + "ground-, long-term" + ] + } + }, { "source": { "context": "emissions to water/lake" @@ -269,6 +509,17 @@ ] } }, + { + "source": { + "context": "Waterborne emissions/ocean" + }, + "target": { + "context": [ + "water", + "ocean" + ] + } + }, { "source": { "context": "emissions to water/river" @@ -282,7 +533,7 @@ }, { "source": { - "context": "emissions to water/river, long-term" 
+ "context": "Waterborne emissions/river" }, "target": { "context": [ @@ -293,11 +544,12 @@ }, { "source": { - "context": "resources" + "context": "emissions to water/river, long-term" }, "target": { "context": [ - "natural resource" + "water", + "surface water" ] } }, @@ -358,17 +610,40 @@ }, { "source": { - "context": "raw" + "context": "Raw" }, "target": { "context": [ "natural resource" ] - } + }, + "comment": "Dummy value used for matching; not a real ecoinvent context" }, { "source": { - "context": "raw/biotic" + "context": "Raw/" + }, + "target": { + "context": [ + "natural resource" + ] + }, + "comment": "Dummy value used for matching; not a real ecoinvent context" + }, + { + "source": { + "context": "Raw materials/" + }, + "target": { + "context": [ + "natural resource" + ] + }, + "comment": "Dummy value used for matching; not a real ecoinvent context" + }, + { + "source": { + "context": "Raw/biotic" }, "target": { "context": [ @@ -379,7 +654,7 @@ }, { "source": { - "context": "raw/in air" + "context": "Raw/in air" }, "target": { "context": [ @@ -390,7 +665,7 @@ }, { "source": { - "context": "raw/in ground" + "context": "Raw/in ground" }, "target": { "context": [ @@ -401,7 +676,7 @@ }, { "source": { - "context": "raw/land" + "context": "Raw/land" }, "target": { "context": [ @@ -409,9 +684,21 @@ "land" ] } - }, { + }, + { + "source": { + "context": "Raw materials/land" + }, + "target": { + "context": [ + "natural resource", + "land" + ] + } + }, + { "source": { - "context": "raw/in water" + "context": "Raw/in water" }, "target": { "context": [ diff --git a/flowmapper/data/standard-units-harmonization.json b/src/flowmapper/data/standard-units-harmonization.json similarity index 73% rename from flowmapper/data/standard-units-harmonization.json rename to src/flowmapper/data/standard-units-harmonization.json index 2b5de70..caa8d44 100644 --- a/flowmapper/data/standard-units-harmonization.json +++ b/src/flowmapper/data/standard-units-harmonization.json @@ -1,5 +1,5 @@ { - "name": "Standard-units-harmonization", + "name": "Flowmapper-standard-units-harmonization", "licenses": [ { "name": "CC BY 4.0", @@ -8,16 +8,33 @@ } ], "version": "1.0.0", - "description": "Standard Brightway unit mapping", - "created": "2024-04-12T09:29:02.823409", + "description": "Standard flowmapper unit harmonization linked with Pint customization", + "created": "2025-11-10T12:34:56Z", "case-insensitive": true, "contributors": [ { "title": "Chris Mutel", "path": "https://chris.mutel.org/", - "role": "author" + "roles": ["author"] } ], + "graph_context": [ + "nodes" + ], + "mapping": { + "source": { + "expression language": "JSONPath", + "labels": { + "unit": "$.unit" + } + }, + "target": { + "expression language": "JSONPath", + "labels": { + "unit": "$.unit" + } + } + }, "update": [ { "source": { @@ -51,6 +68,14 @@ "unit": "gigajoule" } }, + { + "source": { + "unit": "GJ" + }, + "target": { + "unit": "gigajoule" + } + }, { "source": { "unit": "h" @@ -83,6 +108,30 @@ "unit": "kilobecquerel" } }, + { + "source": { + "unit": "livestock unit" + }, + "target": { + "unit": "livestock_unit" + } + }, + { + "source": { + "unit": "kilowatt hour" + }, + "target": { + "unit": "kilowatt_hour" + } + }, + { + "source": { + "unit": "kBq" + }, + "target": { + "unit": "kilobecquerel" + } + }, { "source": { "unit": "kilo becquerel" @@ -123,6 +172,14 @@ "unit": "kilojoule" } }, + { + "source": { + "unit": "kJ" + }, + "target": { + "unit": "kilojoule" + } + }, { "source": { "unit": "kwh" @@ -187,6 +244,14 @@ "unit": 
"square_meter_year" } }, + { + "source": { + "unit": "cubic meter-year" + }, + "target": { + "unit": "cubic_meter_year" + } + }, { "source": { "unit": "m2a" @@ -275,6 +340,14 @@ "unit": "megajoule" } }, + { + "source": { + "unit": "MJ" + }, + "target": { + "unit": "megajoule" + } + }, { "source": { "unit": "my" @@ -283,6 +356,14 @@ "unit": "meter_year" } }, + { + "source": { + "unit": "standard cubic meter" + }, + "target": { + "unit": "standard_cubic_meter" + } + }, { "source": { "unit": "sm3" @@ -291,6 +372,14 @@ "unit": "standard_cubic_meter" } }, + { + "source": { + "unit": "normal cubic meter" + }, + "target": { + "unit": "normal_cubic_meter" + } + }, { "source": { "unit": "nm3" @@ -299,6 +388,22 @@ "unit": "normal_cubic_meter" } }, + { + "source": { + "unit": "sM3" + }, + "target": { + "unit": "standard_cubic_meter" + } + }, + { + "source": { + "unit": "nM3" + }, + "target": { + "unit": "normal_cubic_meter" + } + }, { "source": { "unit": "p" @@ -370,6 +475,14 @@ "target": { "unit": "watt_hour" } + }, + { + "source": { + "unit": "eur2005" + }, + "target": { + "unit": "eur_2005" + } } ] } diff --git a/flowmapper/data/units.txt b/src/flowmapper/data/units.txt similarity index 54% rename from flowmapper/data/units.txt rename to src/flowmapper/data/units.txt index ff6e03a..d9b14e3 100644 --- a/flowmapper/data/units.txt +++ b/src/flowmapper/data/units.txt @@ -9,9 +9,15 @@ square_meter_year = m2 * year = m2y = m2a cubic_meter_year = m3 * year = m3y = m3a # Gas volume at given conditions -[gas_volume] = [pressure] * [volume] -standard_cubic_meter = atmosphere * (meter ** 3) = sm3 -normal_cubic_meter = 1.0732 * standard_cubic_meter = nm3 +# https://en.wikipedia.org/wiki/Standard_temperature_and_pressure +# 273.15 K (0 °C) and an absolute pressure of exactly 1 bar (100 kPa) +standard_cubic_meter = 44.095 * mole = sm3 +# https://www.sciencedirect.com/topics/engineering/cubic-metre +# There are multiple definitions for this but as we only care about natural gas, using the +# Gas Industry Standards Board seems reasonable. +# 288.15 K (15 °C) and an absolute pressure of exactly 1 atm (101.325 kPa) +# See also https://github.com/qudt/qudt-public-repo/issues/1227 +normal_cubic_meter = 41.739 * mole = nm3 # Livestock livestock_unit = [livestock] = LU @@ -31,3 +37,7 @@ vehicle_kilometer = vehicle * kilometer = vkm # Personal travel person = [personal_travel] person_kilometer = person * kilometer = pkm + +# Currency +eur = [currency] +eur_2005 = 1 * eur diff --git a/src/flowmapper/domain/__init__.py b/src/flowmapper/domain/__init__.py new file mode 100644 index 0000000..1cb2e68 --- /dev/null +++ b/src/flowmapper/domain/__init__.py @@ -0,0 +1,15 @@ +"""Domain entities for flowmapper. 
+ +This package contains the core domain model classes: +- Flow: Represents an elementary flow with all its attributes +- NormalizedFlow: Manages flow transformations and matching state +- Match: Represents a mapping between source and target flows +- MatchCondition: Enumeration of match quality levels +""" + +from flowmapper.domain.flow import Flow +from flowmapper.domain.match import Match +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow + +__all__ = ["Flow", "NormalizedFlow", "Match", "MatchCondition"] diff --git a/src/flowmapper/domain/flow.py b/src/flowmapper/domain/flow.py new file mode 100644 index 0000000..4a65200 --- /dev/null +++ b/src/flowmapper/domain/flow.py @@ -0,0 +1,337 @@ +"""Flow class representing an elementary flow with all its attributes.""" + +import itertools +import uuid +from dataclasses import dataclass, field +from typing import Any, Self + +from flowmapper.errors import MissingLocation +from flowmapper.fields import ( + CASField, + ContextField, + OxidationState, + StringField, + replace_location_suffix, + split_location_suffix, +) +from flowmapper.unit import UnitField +from flowmapper.utils import remove_unit_slash + +global_counter = itertools.count(0) + + +@dataclass(frozen=True) +class Flow: + """ + Represents an elementary flow with all its attributes. + + A Flow is an immutable dataclass that represents an elementary flow (e.g., a substance + or material) with its name, unit, context, and optional attributes like location, + CAS number, and synonyms. Flows can be normalized to a standard form for matching + and comparison. + + Attributes + ---------- + name : StringField + The name of the flow (e.g., "Carbon dioxide"). + unit : UnitField + The unit of measurement (e.g., "kg", "m3"). + context : ContextField + The context or category of the flow (e.g., "air", "water"). + identifier : str | None, optional + An optional unique identifier for the flow. + location : str | None, optional + An optional location code (e.g., "NL", "DE", "US"). + oxidation_state : OxidationState | None, optional + The oxidation state of the flow if applicable. + cas_number : CASField | None, optional + The CAS (Chemical Abstracts Service) registry number. + synonyms : list[str], default=[] + A list of alternative names for the flow. + _id : int + Internal unique identifier (auto-generated). + + Examples + -------- + >>> flow = Flow.from_dict({ + ... "name": "Carbon dioxide", + ... "context": "air", + ... "unit": "kg" + ... }) + >>> normalized = flow.normalize() + >>> print(normalized.name.data) + carbon dioxide + """ + + name: StringField + unit: UnitField + context: ContextField + identifier: str | None = None # Internal ID, not necessarily present or unique... + location: str | None = None + oxidation_state: OxidationState | None = None + cas_number: CASField | None = None + synonyms: list[str] = field(default_factory=lambda: []) + conversion_factor: float | None = None + _id: int = field(default_factory=lambda: next(global_counter)) + + @staticmethod + def randonneur_mapping() -> dict: + """ + Return the randonneur mapping configuration for Flow objects. + + Returns + ------- + dict + A dictionary containing JSONPath expressions for mapping Flow attributes + to randonneur transformation format. 
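+
+        Examples
+        --------
+        A small illustration of the returned structure (not exhaustive; see
+        the method body below for the full label set):
+
+        >>> Flow.randonneur_mapping()["expression language"]
+        'JSONPath'
+        >>> Flow.randonneur_mapping()["labels"]["name"]
+        '$.name'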
+ """ + return { + "expression language": "JSONPath", + "labels": { + "unit": "$.unit", + "name": "$.name", + "context": "$.context", + "identifier": "$.identifier", + "location": "$.location", + "cas_number": "$.cas_number", + "synonyms": "$.synonyms", + "conversion_factor": "$.conversion_factor", + }, + } + + @classmethod + def from_dict(cls, data: dict) -> Self: + """ + Create a Flow instance from a dictionary. + + Parameters + ---------- + data : dict + Dictionary containing flow data with keys: name, unit, context, and + optionally identifier, location, oxidation_state, cas_number, synonyms. + + Returns + ------- + Flow + A new Flow instance created from the dictionary data. + + Examples + -------- + >>> flow = Flow.from_dict({ + ... "name": "Carbon dioxide", + ... "context": "air", + ... "unit": "kg", + ... "location": "NL" + ... }) + """ + return cls( + name=StringField(data["name"]), + unit=UnitField(data["unit"]), + context=ContextField(data["context"]), + identifier=data.get("identifier"), + location=data.get("location") or None, + oxidation_state=( + OxidationState(data["oxidation_state"]) + if data.get("oxidation_state") + else None + ), + cas_number=CASField.from_string(data.get("cas_number") or None), + synonyms=data.get("synonyms") or [], + conversion_factor=data.get("conversion_factor"), + ) + + def to_dict(self) -> dict: + """ + Convert the Flow to a dictionary representation. + + Returns + ------- + dict + Dictionary containing the flow's data. Only non-None optional fields + are included. + + Examples + -------- + >>> flow = Flow.from_dict({"name": "CO2", "context": "air", "unit": "kg"}) + >>> flow.to_dict() + {'name': 'CO2', 'unit': 'kg', 'context': ('air',), 'identifier': None} + """ + data = { + "name": self.name.data, + "unit": self.unit.data, + "context": self.context.as_tuple(), + "identifier": self.identifier, + } + for key in ( + "location", + "oxidation_state", + "cas_number", + "synonyms", + "conversion_factor", + ): + if getattr(self, key): + data[key] = getattr(self, key) + return data + + def normalize(self) -> Self: + """ + Normalize the flow to a standard form for matching. + + This method performs several normalization steps: + 1. Removes unit references from the name (e.g., "/kg") + 2. Extracts location from the name suffix (e.g., ", NL") + 3. Extracts oxidation state from the name (e.g., "Iron(II)") + 4. Normalizes the name, unit, and context fields + + Returns + ------- + Flow + A new Flow instance with normalized attributes. + + Examples + -------- + >>> flow = Flow.from_dict({ + ... "name": "Carbon dioxide, NL", + ... "context": "air", + ... "unit": "kg" + ... }) + >>> normalized = flow.normalize() + >>> normalized.location + 'NL' + """ + location, oxidation_state = self.location, self.oxidation_state + name = remove_unit_slash(self) + name, other_location = split_location_suffix(name) + if other_location: + location = other_location + if OxidationState.has_oxidation_state(name): + oxidation_state, name = OxidationState.from_string(name) + + return type(self)( + identifier=self.identifier, + name=StringField(name).normalize(), + location=location, + oxidation_state=oxidation_state, + unit=self.unit.normalize(), + context=self.context.normalize(), + cas_number=self.cas_number, + synonyms=self.synonyms, + conversion_factor=self.conversion_factor, + ) + + def copy_with_new_location(self, location: str) -> Self: + """ + Create a copy of the flow with a new location in the name. 
+
+        This method replaces the location suffix in the flow's name with a new
+        location value. If no location suffix is found, it appends the location
+        to the name. The original flow is not modified.
+
+        The new flow will have a new UUID identifier, regardless of whether
+        the original flow had an identifier.
+
+        Parameters
+        ----------
+        location : str
+            The new location code to use (e.g., "DE", "FR").
+
+        Returns
+        -------
+        Flow
+            A new Flow instance with the updated location in the name and a new
+            UUID identifier.
+
+        Notes
+        -----
+        - If the flow's name contains a location suffix (matched by the
+          ends_with_location regex), it is replaced with the new location.
+        - If no location suffix is found, the location is appended to the name
+          in the format ", <location>".
+        - The new flow always gets a new UUID identifier via `uuid.uuid4()`.
+
+        Examples
+        --------
+        >>> flow = Flow.from_dict({
+        ...     "name": "Carbon dioxide, NL",
+        ...     "context": "air",
+        ...     "unit": "kg"
+        ... })
+        >>> new_flow = flow.copy_with_new_location("DE")
+        >>> new_flow.name.data
+        'Carbon dioxide, DE'
+        >>> new_flow.identifier != flow.identifier
+        True
+        >>> # If no location suffix exists, location is appended
+        >>> flow2 = Flow.from_dict({
+        ...     "name": "Carbon dioxide",
+        ...     "context": "air",
+        ...     "unit": "kg"
+        ... })
+        >>> new_flow2 = flow2.copy_with_new_location("DE")
+        >>> new_flow2.name.data
+        'Carbon dioxide, DE'
+        """
+        if not location:
+            raise ValueError("No location parameter given")
+
+        data = self.to_dict()
+        try:
+            data["name"] = replace_location_suffix(
+                string=data["name"], new_location=location
+            )
+        except MissingLocation:
+            data["name"] = (
+                data["name"].strip()
+                + (", " if not data["name"].endswith(",") else " ")
+                + location
+            )
+        data["identifier"] = str(uuid.uuid4())
+        return type(self).from_dict(data)
+
+    def __repr__(self) -> str:
+        """Return a string representation showing all non-None attributes."""
+        parts = [
+            f"name={self.name!r}",
+            f"unit={self.unit!r}",
+            f"context={self.context!r}",
+        ]
+        if self.identifier is not None:
+            parts.append(f"identifier={self.identifier!r}")
+        if self.location is not None:
+            parts.append(f"location={self.location!r}")
+        if self.oxidation_state is not None:
+            parts.append(f"oxidation_state={self.oxidation_state!r}")
+        if self.cas_number is not None:
+            parts.append(f"cas_number={self.cas_number!r}")
+        if self.synonyms:
+            parts.append(f"synonyms={self.synonyms!r}")
+        if self.conversion_factor:
+            parts.append(f"conversion_factor={self.conversion_factor!r}")
+        return f"Flow({', '.join(parts)})"
+
+    def __eq__(self, other: Any) -> bool:
+        """Check equality based on internal _id."""
+        if not isinstance(other, Flow):
+            return False
+        return self._id == other._id
+
+    def __lt__(self, other: Self) -> bool:
+        """
+        Compare flows for sorting.
+
+        Flows are compared by name, unit, context, and identifier in that order.
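+
+        Examples
+        --------
+        A minimal illustration with ad-hoc flows (ordering is decided here by
+        the name field alone):
+
+        >>> a = Flow.from_dict({"name": "Ammonia", "context": "air", "unit": "kg"})
+        >>> b = Flow.from_dict({"name": "Benzene", "context": "air", "unit": "kg"})
+        >>> a < b
+        True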
+ """ + if not isinstance(other, Flow): + return False + else: + return ( + self.name.data, + self.unit.data, + self.context.value, + self.identifier, + ) < ( + other.name.data, + other.unit.data, + other.context.value, + other.identifier, + ) diff --git a/src/flowmapper/domain/match.py b/src/flowmapper/domain/match.py new file mode 100644 index 0000000..1542cf5 --- /dev/null +++ b/src/flowmapper/domain/match.py @@ -0,0 +1,149 @@ +"""Match class representing a mapping between source and target flows.""" + +from __future__ import annotations + +from collections import UserString +from dataclasses import asdict, dataclass, field +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from flowmapper.domain.flow import Flow + from flowmapper.domain.match_condition import MatchCondition + + +@dataclass +class Match: + """ + Represents a match between a source flow and a target flow. + + A Match object contains information about how a source flow maps to a target + flow, including the match quality (condition), conversion factor, and metadata + about how the match was found. + + Attributes + ---------- + source : Flow + The source flow being matched. + target : Flow + The target flow that the source maps to. + function_name : str + The name of the matching function that found this match. + condition : MatchCondition + The quality/type of the match (exact, close, related, etc.). + conversion_factor : float, default=1.0 + The factor to convert from source unit to target unit. + comment : str, default="" + Optional comment describing the match. + new_target_flow : bool, default=False + Whether this match created a new target flow that didn't exist before. + + Examples + -------- + >>> from flowmapper.domain.flow import Flow + >>> from flowmapper.domain.match import Match + >>> from flowmapper.domain.match_condition import MatchCondition + >>> source = Flow.from_dict({ + ... "name": "Carbon dioxide", + ... "context": "air", + ... "unit": "kg" + ... }) + >>> target = Flow.from_dict({ + ... "name": "Carbon dioxide", + ... "context": "air", + ... "unit": "kg" + ... }) + >>> match = Match( + ... source=source, + ... target=target, + ... function_name="match_identical_names", + ... condition=MatchCondition.exact + ... ) + >>> match.export() + {'source': {...}, 'target': {...}, 'condition': '...', ...} + """ + + source: Flow + target: Flow + function_name: str + condition: MatchCondition + conversion_factor: float = 1.0 + comment: str = field(default_factory=lambda: "") + new_target_flow: bool = False + + def export(self, flowmapper_metadata: bool = False) -> dict: + """ + Export the match to a dictionary format. + + This method serializes the match to a dictionary suitable for JSON + export or storage. The source and target flows are converted to + dictionaries, and special objects are serialized appropriately. + + Parameters + ---------- + flowmapper_metadata : bool, default=False + If True, include flowmapper-specific metadata (version, function_name) + in the export. + + Returns + ------- + dict + Dictionary containing the match data, with source and target as + dictionaries and condition as a string URI. 
+
+        Examples
+        --------
+        >>> match.export()
+        {'source': {...}, 'target': {...}, 'condition': '...', ...}
+        >>> match.export(flowmapper_metadata=True)
+        {'source': {...}, 'target': {...}, 'condition': '...', 'flowmapper_metadata': {...}, ...}
+        """
+        from flowmapper import __version__
+        from flowmapper.fields import ContextField
+
+        def serializable(obj: Any) -> Any:
+            if isinstance(obj, UserString):
+                return str(obj)
+            elif isinstance(obj, ContextField):
+                return obj.value
+            return obj
+
+        data = asdict(self)
+        data["source"] = {
+            k: serializable(v)
+            for k, v in data["source"].items()
+            if v and not k.startswith("_")
+        }
+        data["target"] = {
+            k: serializable(v)
+            for k, v in data["target"].items()
+            if v and not k.startswith("_")
+        }
+        data["condition"] = str(data["condition"])
+
+        function_name = data.pop("function_name")
+        if flowmapper_metadata:
+            data["flowmapper_metadata"] = {
+                "version": __version__,
+                "function_name": function_name,
+            }
+
+        return data
+
+    def __lt__(self, other: Match) -> bool:
+        """
+        Compare matches for sorting.
+
+        Matches are sorted by the string representations of source name,
+        source context, target name, and target context, in that order.
+        """
+        return (
+            str(self.source.name),
+            str(self.source.context),
+            str(self.target.name),
+            str(self.target.context),
+        ) < (
+            str(other.source.name),
+            str(other.source.context),
+            str(other.target.name),
+            str(other.target.context),
+        )
diff --git a/src/flowmapper/domain/match_condition.py b/src/flowmapper/domain/match_condition.py
new file mode 100644
index 0000000..b24d2f0
--- /dev/null
+++ b/src/flowmapper/domain/match_condition.py
@@ -0,0 +1,83 @@
+"""MatchCondition enum for representing match quality levels."""
+
+from enum import StrEnum
+
+
+class MatchCondition(StrEnum):
+    """
+    Enumeration of match quality conditions based on SKOS vocabulary.
+
+    Match conditions represent the semantic relationship between source and target
+    flows in a mapping. They follow the SKOS (Simple Knowledge Organization System)
+    vocabulary for concept matching.
+
+    Attributes
+    ----------
+    exact : str
+        Exact match - the flows are semantically identical.
+        SKOS URI: http://www.w3.org/2004/02/skos/core#exactMatch
+    close : str
+        Close match - the flows are very similar but not identical.
+        SKOS URI: http://www.w3.org/2004/02/skos/core#closeMatch
+    related : str
+        Related match - the flows are related but not equivalent.
+        SKOS URI: http://www.w3.org/2004/02/skos/core#relatedMatch
+    narrow : str
+        Narrow match - the target is more specific than the source.
+        SKOS URI: http://www.w3.org/2004/02/skos/core#narrowMatch
+    broad : str
+        Broad match - the target is more general than the source.
+        SKOS URI: http://www.w3.org/2004/02/skos/core#broadMatch
+
+    Examples
+    --------
+    >>> condition = MatchCondition.exact
+    >>> condition.as_glad()
+    '='
+    >>> condition = MatchCondition.related
+    >>> condition.as_glad()
+    '~'
+    """
+
+    exact = "http://www.w3.org/2004/02/skos/core#exactMatch"
+    close = "http://www.w3.org/2004/02/skos/core#closeMatch"
+    related = "http://www.w3.org/2004/02/skos/core#relatedMatch"
+    # A triple <A> skos:broader <B> asserts that <B>, the object of the triple, is a broader concept
+    # than <A>, the subject of the triple.
+    narrow = "http://www.w3.org/2004/02/skos/core#narrowMatch"  # in SKOS the *target* is narrower than the *source*
+    broad = "http://www.w3.org/2004/02/skos/core#broadMatch"  # in SKOS the *target* is broader than the *source*
+
+    def as_glad(self) -> str:
+        """
+        Convert match condition to GLAD format symbol.
+ + GLAD (Global LCA Data Access) network uses single-character symbols + to represent match conditions in flow mappings. + + Returns + ------- + str + Single character symbol: + - "=" for exact match + - "~" for close or related match + - ">" for narrow match + - "<" for broad match + + Examples + -------- + >>> MatchCondition.exact.as_glad() + '=' + >>> MatchCondition.related.as_glad() + '~' + """ + if self.value == "http://www.w3.org/2004/02/skos/core#exactMatch": + return "=" + elif self.value == "http://www.w3.org/2004/02/skos/core#closeMatch": + return "~" + elif self.value == "http://www.w3.org/2004/02/skos/core#relatedMatch": + return "~" + elif self.value == "http://www.w3.org/2004/02/skos/core#narrowMatch": + return ">" + elif self.value == "http://www.w3.org/2004/02/skos/core#broadMatch": + return "<" + raise ValueError # Just for silly type checking diff --git a/src/flowmapper/domain/normalized_flow.py b/src/flowmapper/domain/normalized_flow.py new file mode 100644 index 0000000..6c925ca --- /dev/null +++ b/src/flowmapper/domain/normalized_flow.py @@ -0,0 +1,239 @@ +"""NormalizedFlow class for managing flow transformations and matching state.""" + +from __future__ import annotations + +from copy import copy +from dataclasses import dataclass +from typing import TYPE_CHECKING, Self + +if TYPE_CHECKING: + from flowmapper.domain.flow import Flow + + +@dataclass +class NormalizedFlow: + """ + Represents a flow with its original, normalized, and current states. + + NormalizedFlow tracks a flow through its lifecycle: + - `original`: The flow as it was initially created + - `normalized`: The flow after normalization (standard form for matching) + - `current`: The current state (can be modified for transformations) + + This class is used for matching flows where transformations may be temporarily + applied to the `current` state, then reset back to `normalized`. + + Attributes + ---------- + original : Flow + The original flow as created from source data. + normalized : Flow + The normalized version of the flow (standard form). + current : Flow + The current state of the flow (can be modified). + matched : bool, default=False + Whether this flow has been matched to a target flow. + + Examples + -------- + >>> from flowmapper.domain.flow import Flow + >>> from flowmapper.domain.normalized_flow import NormalizedFlow + >>> flow = Flow.from_dict({ + ... "name": "Carbon dioxide", + ... "context": "air", + ... "unit": "kg" + ... }) + >>> normalized = flow.normalize() + >>> nf = NormalizedFlow( + ... original=flow, + ... normalized=normalized, + ... current=copy(normalized) + ... 
) + >>> nf.update_current(name="Modified") + >>> nf.reset_current() # Reset to normalized state + """ + + original: Flow + normalized: Flow + current: Flow + matched: bool = False + + @property + def name(self) -> str: + """Return the current flow's name.""" + return self.current.name.data + + @property + def unit(self) -> str: + """Return the current flow's unit.""" + return self.current.unit.data + + @property + def context(self) -> str | list[str] | tuple[str]: + """Return the current flow's context.""" + return self.current.context.value + + @property + def identifier(self) -> str | None: + """Return the current flow's identifier.""" + return self.current.identifier + + @property + def location(self) -> str | None: + """Return the current flow's location.""" + return self.current.location + + @property + def oxidation_state(self) -> int | None: + """Return the current flow's oxidation state value.""" + return ( + self.current.oxidation_state.value if self.current.oxidation_state else None + ) + + @property + def cas_number(self) -> str | None: + """Return the current flow's CAS number.""" + return self.current.cas_number.data if self.current.cas_number else None + + @property + def synonyms(self) -> list[str] | None: + """Return the current flow's synonyms.""" + return self.current.synonyms + + @property + def id(self) -> int: + """Return the original flow's internal ID.""" + return self.original._id + + def reset_current(self) -> None: + """ + Reset the current flow to the normalized state. + + This method creates a copy of the normalized flow and sets it as the + current flow. Useful after applying temporary transformations. + """ + self.current = copy(self.normalized) + + def update_current(self, **kwargs) -> None: + """ + Update the current flow with new attribute values. + + This method creates a new Flow based on the normalized flow's data, + updated with the provided keyword arguments. The normalized flow + remains unchanged. + + Parameters + ---------- + **kwargs + Keyword arguments corresponding to Flow attributes (name, unit, + context, location, etc.). + + Examples + -------- + >>> nf.update_current(name="Modified name", unit="g") + """ + from flowmapper.domain.flow import Flow + + data = self.normalized.to_dict() + data.update(kwargs) + self.current = Flow.from_dict(data) + + @staticmethod + def from_dict(data: dict) -> NormalizedFlow: + """ + Create a NormalizedFlow from a dictionary. + + This method creates the original flow, normalizes it, and sets up + the NormalizedFlow with all three states. + + Parameters + ---------- + data : dict + Dictionary containing flow data. + + Returns + ------- + NormalizedFlow + A new NormalizedFlow instance. + """ + from flowmapper.domain.flow import Flow + + original = Flow.from_dict(data) + # Do data preprocessing here + normalized = original.normalize() + return NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + def unit_compatible(self, other: Self) -> bool: + """ + Check if this flow's unit is compatible with another flow's unit. + + Parameters + ---------- + other : NormalizedFlow + Another NormalizedFlow to compare units with. + + Returns + ------- + bool + True if the units are compatible (can be converted), False otherwise. + """ + return self.current.unit.compatible(other.current.unit) + + def conversion_factor(self, other: Self) -> float: + """ + Calculate the conversion factor from this flow's unit to another flow's unit. 
+ + Parameters + ---------- + other : NormalizedFlow + Another NormalizedFlow to convert to. + + Returns + ------- + float + The conversion factor to multiply this flow's value by to get the + equivalent value in the other flow's unit. + """ + from_transformation = self.current.conversion_factor or 1.0 + return from_transformation * self.current.unit.conversion_factor( + other.current.unit + ) + + def export(self) -> dict: + """ + Export the flow data for serialization. + + Returns a dictionary containing the original flow's data, suitable + for JSON serialization or export to external formats. + + Returns + ------- + dict + Dictionary containing flow data with only non-None values. + """ + data = [ + ("name", self.original.name.data), + ("unit", self.original.unit.data), + ("context", self.original.context.value), + ("identifier", self.original.identifier), + ("location", self.original.location), + ( + "cas_number", + ( + self.normalized.cas_number.export() + if self.normalized.cas_number + else None + ), + ), + ] + return {k: v for k, v in data if v} + + def __repr__(self) -> str: + """Return a string representation showing non-None attributes of original and current.""" + return f"""NormalizedFlow( + original={self.original!r} + current={self.current!r} + matched={self.matched!r} +)""" diff --git a/flowmapper/errors.py b/src/flowmapper/errors.py similarity index 66% rename from flowmapper/errors.py rename to src/flowmapper/errors.py index 6f512ca..e0e7d13 100644 --- a/flowmapper/errors.py +++ b/src/flowmapper/errors.py @@ -4,3 +4,7 @@ class DifferingMatches(Exception): class DifferingConversions(Exception): """Multiple, different conversion factors provided for a given match""" + + +class MissingLocation(Exception): + """Expected a location element in a name, but didn't find any""" diff --git a/src/flowmapper/extraction/__init__.py b/src/flowmapper/extraction/__init__.py new file mode 100644 index 0000000..ca59247 --- /dev/null +++ b/src/flowmapper/extraction/__init__.py @@ -0,0 +1,6 @@ +# from flowmapper.extraction.ecoinvent import ecoinvent_biosphere_extractor +from flowmapper.extraction.ecospold2 import ecospold2_biosphere_extractor +from flowmapper.extraction.simapro_csv import simapro_csv_biosphere_extractor +from flowmapper.extraction.simapro_ecospold1 import ( + simapro_ecospold1_biosphere_extractor, +) diff --git a/flowmapper/extraction/ecospold2.py b/src/flowmapper/extraction/ecospold2.py similarity index 92% rename from flowmapper/extraction/ecospold2.py rename to src/flowmapper/extraction/ecospold2.py index 7a2ec9f..9489a84 100644 --- a/flowmapper/extraction/ecospold2.py +++ b/src/flowmapper/extraction/ecospold2.py @@ -20,7 +20,7 @@ def reformat(obj: dict) -> dict: elif obj.get("synonym") and "#text" in obj["synonym"]: data["synonyms"] = [obj["synonym"]["#text"]] if "@casNumber" in obj: - data["CAS number"] = obj["@casNumber"] + data["cas_number"] = obj["@casNumber"] return data @@ -43,7 +43,9 @@ def remove_conflicting_synonyms(data: list[dict]) -> list[dict]: if not (obj.get("synonyms") and obj.get("context")): continue obj["synonyms"] = [ - syn for syn in obj["synonyms"] if syn.lower() not in base_names[obj["context"][0]] + syn + for syn in obj["synonyms"] + if syn.lower() not in base_names[obj["context"][0]] ] return data diff --git a/flowmapper/extraction/simapro_csv.py b/src/flowmapper/extraction/simapro_csv.py similarity index 81% rename from flowmapper/extraction/simapro_csv.py rename to src/flowmapper/extraction/simapro_csv.py index 64a0313..d6bd383 100644 --- 
a/flowmapper/extraction/simapro_csv.py
+++ b/src/flowmapper/extraction/simapro_csv.py
@@ -2,7 +2,9 @@
 from pathlib import Path
 
 import bw_simapro_csv
-from loguru import logger
+import structlog
+
+logger = structlog.get_logger("flowmapper")
 
 
 def is_simapro_csv_file(fp: Path) -> bool:
@@ -14,7 +16,9 @@ def is_simapro_csv_file(fp: Path) -> bool:
         ].project
         return True
     except:
-        logger.critical("Skipping {a} as we can't read it as a SimaPro file", a=fp.name)
+        logger.critical(
+            "Skipping file %s as we can't read it as a SimaPro file", fp.name
+        )
         return False
 
@@ -46,7 +50,8 @@
         process.blocks.values(),
     ):
         for line in block.parsed:
-            flows.add((line["context"], line["name"], line["unit"]))
+            # Restore context to single string as this is expected in our mapping
+            flows.add(("/".join(line["context"]), line["name"], line["unit"]))
 
     with open(output_path, "w") as f:
         json.dump(
diff --git a/flowmapper/extraction/simapro_ecospold1.py b/src/flowmapper/extraction/simapro_ecospold1.py
similarity index 64%
rename from flowmapper/extraction/simapro_ecospold1.py
rename to src/flowmapper/extraction/simapro_ecospold1.py
index 148d99c..224e704 100644
--- a/flowmapper/extraction/simapro_ecospold1.py
+++ b/src/flowmapper/extraction/simapro_ecospold1.py
@@ -2,17 +2,6 @@
 from pathlib import Path
 
 import pyecospold
-from loguru import logger
-
-# def is_simapro_csv_file(fp: Path) -> bool:
-#     if not fp.is_file() or not fp.suffix.lower() == ".csv":
-#         return False
-#     try:
-#         bw_simapro_csv.header.parse_header(open(fp, encoding="sloppy-windows-1252"))[0].project
-#         return True
-#     except:
-#         logger.critical("Skipping {a} as we can't read it as a SimaPro file", a=fp.name)
-#         return False
 
 
 def simapro_ecospold1_biosphere_extractor(dirpath: Path, output_fp: Path) -> None:
diff --git a/src/flowmapper/fields/__init__.py b/src/flowmapper/fields/__init__.py
new file mode 100644
index 0000000..30192aa
--- /dev/null
+++ b/src/flowmapper/fields/__init__.py
@@ -0,0 +1,24 @@
+"""Field classes and utilities for Flow attributes.
+
+This package contains field classes and related utilities used by Flow objects:
+- CASField: Chemical Abstracts Service registry number field
+- ContextField: Context field for flow categorization
+- StringField: String field with normalization support
+- OxidationState: Oxidation state representation
+- Location utilities: Functions for extracting and manipulating location codes
+"""
+
+from flowmapper.fields.cas import CASField
+from flowmapper.fields.context import ContextField
+from flowmapper.fields.location import replace_location_suffix, split_location_suffix
+from flowmapper.fields.oxidation_state import OxidationState
+from flowmapper.fields.string_field import StringField
+
+__all__ = [
+    "CASField",
+    "ContextField",
+    "StringField",
+    "OxidationState",
+    "replace_location_suffix",
+    "split_location_suffix",
+]
diff --git a/src/flowmapper/fields/cas.py b/src/flowmapper/fields/cas.py
new file mode 100644
index 0000000..880bb3d
--- /dev/null
+++ b/src/flowmapper/fields/cas.py
@@ -0,0 +1,73 @@
+import re
+from collections import UserString
+from functools import cached_property
+from typing import Any
+
+valid_cas = re.compile(r"^\s*[0-9]{2,7}-[0-9]{2}-[0-9]{1}\s*$")
+
+
+class CASField(UserString):
+    def __init__(self, string: str):
+        if not isinstance(string, (str, UserString)):
+            raise TypeError(
+                f"CASField takes only `str`, but got {type(string)} for {string}"
+            )
+        if not valid_cas.search(str(string)):
+            raise ValueError(f"Given input is not valid CAS formatting: '{string}'")
+        super().__init__(str(string))
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, CASField):
+            return self.data == other.data
+        elif isinstance(other, (str, UserString)):
+            other_cas = CASField.from_string(str(other))
+            if other_cas is None:
+                return False
+            return self.data == other_cas.data
+        return False
+
+    @staticmethod
+    def from_string(string: str | None) -> "CASField | None":
+        """Returns `None` if the CAS number is missing or invalid"""
+        if string is None or not isinstance(string, (str, UserString)):
+            return None
+        try:
+            new_cas = CASField(string.strip().lstrip("0").strip())
+        except ValueError:
+            # Malformed CAS strings are treated as missing rather than raising
+            return None
+        if not new_cas.valid():
+            return None
+        return new_cas
+
+    @property
+    def digits(self) -> list[int]:
+        return [int(d) for d in self.data.replace("-", "")]
+
+    def export(self):
+        return "{}-{}-{}".format(
+            "".join([str(x) for x in self.digits[:-3]]),
+            "".join([str(x) for x in self.digits[-3:-1]]),
+            self.digits[-1],
+        )
+
+    @cached_property
+    def check_digit_expected(self):
+        """
+        Expected digit according to the https://www.cas.org/support/documentation/chemical-substances/checkdig algorithm
+        """
+        result = (
+            sum(
+                [
+                    index * value
+                    for index, value in enumerate(self.digits[-2::-1], start=1)
+                ]
+            )
+            % 10
+        )
+        return result
+
+    def valid(self):
+        return (self.digits[-1] == self.check_digit_expected) and bool(
+            valid_cas.search(self.data)
+        )
diff --git a/src/flowmapper/fields/context.py b/src/flowmapper/fields/context.py
new file mode 100644
index 0000000..b610296
--- /dev/null
+++ b/src/flowmapper/fields/context.py
@@ -0,0 +1,75 @@
+from collections.abc import Iterable
+from typing import Any, Self
+
+from flowmapper.utils import as_normalized_tuple
+
+RESOURCE_CATEGORY = {
+    "natural resources",
+    "natural resource",
+    "resources",
+    "resource",
+    "land use",
+    "economic",
+    "social",
+    "raw materials",
+    "raw",
+}
+
+
+class ContextField:
+    def __init__(self, value: str | list[str] | tuple[str]):
+        self.value = value
+
+    def normalize(self, obj: Any | None = None, mapping: dict | None = None) -> Self:
+        return type(self)(value=as_normalized_tuple(value=obj or self.value))
+
+    def is_resource(self) -> bool:
+        if isinstance(self.value, str):
+            return any(cat in self.value.lower() for cat in RESOURCE_CATEGORY)
+        else:
+            lowered = [elem.lower() for elem in self.value]
+            return any(cat in lowered for cat in RESOURCE_CATEGORY)
+
+    def as_tuple(self) -> tuple | str:
+        if isinstance(self.value, str):
+            return self.value
+        return tuple(self.value)
+
+    def export_as_string(self, join_character: str = "✂️"):
+        if isinstance(self.value, (list, tuple)):
+            return join_character.join(self.value)
+        return self.value
+
+    def __iter__(self) -> Iterable:
+        return iter(self.value)
+
+    def __eq__(self, other: Any) -> bool:
+        if self and other and isinstance(other, ContextField):
+            return self.value == other.value
+        else:
+            try:
+                return self.value == self.normalize(other).value
+            except ValueError:
+                return False
+
+    def __repr__(self) -> str:
+        return str(self.value)
+
+    def __bool__(self) -> bool:
+        return bool(self.value)
+
+    def __hash__(self) -> int:
+        return hash(self.value)
+
+    def __contains__(self, other: Any) -> bool:
+        """`self` context is more generic than the `other` context.
+
+        ```python
+        ContextField("a/b/c") in ContextField("a/b")
+        >>> True
+        ```
+
+        """
+        if not isinstance(other, ContextField):
+            return False
+        return self.value == other.value[: len(self.value)]
diff --git a/src/flowmapper/fields/location.py b/src/flowmapper/fields/location.py
new file mode 100644
index 0000000..d994e34
--- /dev/null
+++ b/src/flowmapper/fields/location.py
@@ -0,0 +1,153 @@
+"""
+Location code extraction and manipulation utilities.
+
+This module provides functions for working with location codes that appear as
+suffixes in flow names. Location codes are typically appended to flow names
+in the format ", <location>" where <location> is a recognized location code
+from the places.json data file.
+
+The module uses a compiled regex pattern (ends_with_location) to identify
+location codes at the end of strings, following the pattern of a comma,
+whitespace, and a recognized location code.
+"""
+
+import importlib.resources as resource
+import json
+import re
+from pathlib import Path
+
+import structlog
+
+from flowmapper.errors import MissingLocation
+
+logger = structlog.get_logger("flowmapper")
+
+RESULTS_DIR = Path(__file__).parent.parent / "manual_matching" / "results"
+
+with resource.as_file(
+    resource.files("flowmapper") / "data" / "places.json"
+) as filepath:
+    places = json.load(open(filepath))
+
+# Compiled regex pattern that matches location codes at the end of strings.
+# Pattern matches: comma (not preceded by whitespace), one or more spaces,
+# followed by a recognized location code from places.json, optionally followed
+# by whitespace, at the end of the string.
+# The location code is captured in a named group "location".
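+#
+# As an illustration (assuming "NL" is one of the recognized codes in
+# places.json):
+#
+#   ends_with_location.search("Ammonia, NL").group("location")  -> "NL"
+#   ends_with_location.search("Ammonia NL") is None  -> True (no comma)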
+ends_with_location = re.compile(
+    r"(?<!\s),\s+(?P<location>{})\s*$".format(
+        "|".join([re.escape(string) for string in places])
+    ),
+)
+# All solutions I found for returning original string instead of
+# lower case one were very ugly
+# location_reverser = {obj.lower(): obj for obj in places}
+# if len(location_reverser) != len(places):
+#     raise ValueError("Multiple possible locations after lower case conversion")
+
+
+# us_lci_ends_with_location = re.compile(
+#     "/(?P<location>{})$".format(
+#         "|".join(
+#             [
+#                 re.escape(string)
+#                 for string in places
+#                 if 2 <= len(string) <= 3 and string.upper() == string
+#             ]
+#         )
+#     ),
+# )
+
+with resource.as_file(
+    resource.files("flowmapper") / "data" / "names_and_locations.json"
+) as filepath:
+    names_and_locations = {o["source"]: o for o in json.load(open(filepath))}
+
+
+def split_location_suffix(string: str) -> tuple[str, str | None]:
+    """
+    Split a string into name and location code if a location suffix is present.
+
+    This function searches for a location code at the end of the input string
+    using the ends_with_location regex pattern. If found, it returns the name
+    part (without the location suffix) and the location code. If no location
+    is found, it returns the original string and None.
+
+    The location code must appear at the end of the string in the format
+    ", <location>" where the comma is not preceded by whitespace, followed
+    by one or more spaces, and then a recognized location code.
+
+    Parameters
+    ----------
+    string : str
+        The input string that may contain a location suffix at the end.
+
+    Returns
+    -------
+    tuple[str, str | None]
+        A tuple containing:
+        - The name part without the location suffix (or original string if no
+          location found)
+        - The location code if found, otherwise None
+
+    Examples
+    --------
+    >>> split_location_suffix("Ammonia, NL")
+    ('Ammonia', 'NL')
+    >>> split_location_suffix("Ammonia, pure, NL")
+    ('Ammonia, pure', 'NL')
+    >>> split_location_suffix("Ammonia")
+    ('Ammonia', None)
+    >>> split_location_suffix("Ammonia, NL, pure")
+    ('Ammonia, NL, pure', None)
+    >>> split_location_suffix(", NL")
+    ('', 'NL')
+    """
+    if match := ends_with_location.search(string):
+        return string[: match.start()], match.group("location")
+    return string, None
+
+
+def replace_location_suffix(string: str, new_location: str) -> str:
+    """
+    Replace the location value found by ends_with_location regex with a new value.
+
+    If the string ends with a location code (matched by ends_with_location regex),
+    replace it with the new location value. If no location is found, raises
+    MissingLocation.
+
+    Parameters
+    ----------
+    string : str
+        The input string that must contain a location suffix at the end.
+    new_location : str
+        The new location value to replace the existing location with.
+
+    Returns
+    -------
+    str
+        The string with the location replaced.
+
+    Raises
+    ------
+    MissingLocation
+        If no location suffix is found in the input string.
+
+    Examples
+    --------
+    >>> replace_location_suffix("Ammonia, NL", "DE")
+    'Ammonia, DE'
+    >>> replace_location_suffix("Ammonia, pure, NL", "FR")
+    'Ammonia, pure, FR'
+    >>> replace_location_suffix("Ammonia", "DE")
+    Traceback (most recent call last):
+    ...
+    MissingLocation: No location suffix found in string 'Ammonia'
+    """
+    if match := ends_with_location.search(string):
+        return (
+            string[: match.start("location")]
+            + new_location
+            + string[match.end("location") :]
+        )
+    raise MissingLocation(f"No location suffix found in string {string!r}")
diff --git a/src/flowmapper/fields/oxidation_state.py b/src/flowmapper/fields/oxidation_state.py
new file mode 100644
index 0000000..730f14d
--- /dev/null
+++ b/src/flowmapper/fields/oxidation_state.py
@@ -0,0 +1,64 @@
+import re
+from typing import Any, Self
+
+import roman
+
+roman_numerals_optional_parentheses = re.compile(
+    r"[\,\s]+\(?\s*(?P<numeral>[IVX]+)\s*(?P<sign>[+-]*)\)?\s*$",
+    flags=re.IGNORECASE,
+)
+numbers_optional_parentheses = re.compile(
+    r"[\,\s]+\(?\s*(?P<sign>[+-]+)(?P<numeral>[0-9]+)\)?\s*$"
+)
+
+
+class OxidationState:
+    def __init__(self, value: int):
+        self.value = value
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, OxidationState):
+            return self.value == other.value
+        else:
+            return self.value == other
+
+    def __hash__(self) -> int:
+        return hash(self.value)
+
+    def __repr__(self) -> str:
+        return str(self.value)
+
+    @staticmethod
+    def has_oxidation_state(obj: str) -> bool:
+        return bool(
+            roman_numerals_optional_parentheses.search(obj)
+            or numbers_optional_parentheses.search(obj)
+        )
+
+    @classmethod
+    def from_string(cls, obj: str) -> tuple[Self, str]:
+        if match := roman_numerals_optional_parentheses.search(obj):
+            obj_dict = match.groupdict()
+            try:
+                value = roman.fromRoman(obj_dict["numeral"].upper())
+            except roman.InvalidRomanNumeralError:
+                raise ValueError(
+                    f"{obj_dict['numeral']} in string {obj} is not a valid roman numeral"
+                )
+            if "-" in obj_dict["sign"]:
+                value *= -1
+        elif match := numbers_optional_parentheses.search(obj):
+            obj_dict = match.groupdict()
+            # int() handles leading zeros directly; no need for eval() here
+            value = int(obj_dict["numeral"])
+            if "-" in obj_dict["sign"]:
+                value *= -1
+        else:
+            raise ValueError("No match found")
+
+        if value < -5 or value > 9:
+            raise ValueError(
+                f"Oxidation state {value} from name {obj} is outside physical bounds of [-5, +9]"
+            )
+
+        return OxidationState(value), obj[: match.start()]
diff --git a/src/flowmapper/fields/string_field.py b/src/flowmapper/fields/string_field.py
new file mode 100644
index 0000000..33e975f
--- /dev/null
+++ b/src/flowmapper/fields/string_field.py
@@ -0,0 +1,23 @@
+from collections import UserString
+from typing import Any, Self
+
+from flowmapper.utils import normalize_str
+
+
+class StringField(UserString):
+    def normalize(self, lowercase: bool = True) -> Self:
+        value = normalize_str(self.data)
+        if lowercase:
+            value = value.lower()
+        return type(self)(value)
+
+    def __eq__(self, other: Any) -> bool:
+        if not self.data:
+            # Empty strings aren't equal for our use case
+            return False
+        elif isinstance(other, StringField):
+            return self.data == other.data
+        elif isinstance(other, str):
+            return self.data == other or self.data == normalize_str(other)
+        else:
+            return False
diff --git a/src/flowmapper/flowmap.py b/src/flowmapper/flowmap.py
new file mode 100644
index 0000000..6477ddd
--- /dev/null
+++ b/src/flowmapper/flowmap.py
@@ -0,0 +1,723 @@
+from collections import Counter
+from collections.abc import Callable
+from functools import cached_property
+from pathlib import Path
+from time import time
+
+import pandas as pd
+import randonneur
+from structlog import get_logger
+
+from flowmapper import __version__
+from flowmapper.domain.flow import Flow
+from flowmapper.domain.match import Match
+from flowmapper.domain.normalized_flow import
NormalizedFlow +from flowmapper.matching import match_rules +from flowmapper.utils import apply_transformation_and_convert_flows_to_normalized_flows + +logger = get_logger("flowmapper") + + +class Flowmap: + """ + Crosswalk of flows from a source flow list to a target flow list. + + The Flowmap class manages the mapping process between source and target flow lists + using a series of matching rules. It tracks matches, generates statistics, and + provides export functionality for various formats (randonneur, GLAD). + + The class applies matching rules sequentially to find correspondences between + source and target flows. As matches are found, source flows are marked as matched + and excluded from subsequent rule applications. New target flows can be created + during the matching process and added to the target flow list. + + Attributes + ---------- + source_flows : list[NormalizedFlow] + The list of source flows to be mapped. These flows are checked against + matching rules to find correspondences with target flows. + target_flows : list[NormalizedFlow] + The list of target flows for mapping. This list can grow during matching + if new target flows are created by matching rules. + matches : list[Match] + List of Match objects representing successful mappings between source + and target flows. Initially empty, populated by `generate_matches()`. + rules : list[Callable[..., list[Match]]] + List of matching rule functions to apply. Each rule is a callable that + takes source_flows and target_flows and returns a list of Match objects. + data_preparation_functions : list[Callable[..., list[NormalizedFlow]]] + List of transformation functions used to prepare flows for matching and + to normalize newly created target flows. + show_progressbar : bool + Whether to display a progress bar during matching (currently not used). + + Examples + -------- + >>> from flowmapper.domain.flow import Flow + >>> from flowmapper.domain.normalized_flow import NormalizedFlow + >>> from flowmapper.flowmap import Flowmap + >>> from copy import copy + >>> + >>> # Create source and target flows + >>> source_flow = Flow.from_dict({ + ... "name": "Carbon dioxide", + ... "context": "air", + ... "unit": "kg" + ... }) + >>> source_nf = NormalizedFlow( + ... original=source_flow, + ... normalized=source_flow.normalize(), + ... current=copy(source_flow.normalize()) + ... ) + >>> + >>> target_flow = Flow.from_dict({ + ... "name": "Carbon dioxide", + ... "context": "air", + ... "unit": "kg" + ... }) + >>> target_nf = NormalizedFlow( + ... original=target_flow, + ... normalized=target_flow.normalize(), + ... current=copy(target_flow.normalize()) + ... ) + >>> + >>> # Create Flowmap and generate matches + >>> flowmap = Flowmap( + ... source_flows=[source_nf], + ... target_flows=[target_nf], + ... data_preparation_functions=[] + ... ) + >>> flowmap.generate_matches() + >>> len(flowmap.matches) + 1 + """ + + def __init__( + self, + source_flows: list[NormalizedFlow], + target_flows: list[NormalizedFlow], + data_preparation_functions: list[Callable[..., list[NormalizedFlow]]], + rules: list[Callable[..., list[Match]]] | None = None, + show_progressbar: bool = True, + ): + """ + Initialize a Flowmap with source and target flows. + + Creates a new Flowmap instance to manage the mapping process between + source and target flow lists. The matching rules and data preparation + functions are set up for use during the matching process. 
+
+        Parameters
+        ----------
+        source_flows : list[NormalizedFlow]
+            The list of source flows to be mapped. These flows will be checked
+            against matching rules to find correspondences with target flows.
+        target_flows : list[NormalizedFlow]
+            The list of target flows for mapping. This list can grow during
+            matching if new target flows are created.
+        data_preparation_functions : list[Callable[..., list[NormalizedFlow]]]
+            List of transformation functions used to prepare flows for matching.
+            These functions are also used to normalize newly created target flows
+            when they are added via `add_new_target_flows()`.
+        rules : list[Callable[..., list[Match]]] | None, optional
+            Custom matching rules to use. Each rule is a callable that takes
+            `source_flows` and `target_flows` as arguments and returns a list
+            of Match objects. If None, defaults to the rules returned by
+            `match_rules()`.
+        show_progressbar : bool, default=True
+            Whether to show a progress bar during matching (currently not
+            implemented).
+
+        Notes
+        -----
+        - The `matches` list is initialized as empty and populated by calling
+          `generate_matches()`.
+        - Source flows are filtered by their `matched` attribute during rule
+          application, so only unmatched flows are passed to each rule.
+        - New target flows created during matching are automatically normalized
+          using the data preparation functions before being added to the target
+          flow list.
+
+        Examples
+        --------
+        >>> from flowmapper.flowmap import Flowmap
+        >>> from flowmapper.matching import match_rules
+        >>>
+        >>> flowmap = Flowmap(
+        ...     source_flows=[source_nf1, source_nf2],
+        ...     target_flows=[target_nf1, target_nf2],
+        ...     data_preparation_functions=[],
+        ...     rules=match_rules()
+        ... )
+        """
+        self.show_progressbar = show_progressbar
+        self.rules = rules if rules else match_rules()
+        self.data_preparation_functions = data_preparation_functions
+        self.source_flows = source_flows
+        self.target_flows = target_flows
+        self.matches = []
+
+    @cached_property
+    def _matched_source_flows_ids(self) -> set[int]:
+        """Get a set of source flow IDs that have been matched.
+
+        Returns
+        -------
+        set[int]
+            Set of internal IDs (_id) from source flows that appear in matches.
+            Empty set if no matches exist.
+
+        Notes
+        -----
+        - This is a cached property used internally by `matched_source()` and
+          `unmatched_source` to efficiently determine which flows have been matched
+        - The cache is not invalidated when `matches` changes
+        """
+        return {match.source._id for match in self.matches}
+
+    def generate_matches(self) -> None:
+        """Generate matches by applying all matching rules sequentially.
+
+        This method iterates through all matching rules and applies them to
+        find correspondences between source and target flows. For each rule:
+        1. Filters source flows to only include unmatched flows
+        2. Calls the rule function with unmatched source flows and all target flows
+        3. Extends the matches list with results from the rule
+        4. If any matches create new target flows, adds them to the target flow list
+        5. Logs the number of matches found and time taken
+
+        After this method completes, the `matches` list contains all matches
+        found by all rules, and source flows that were matched will have their
+        `matched` attribute set to True.
+ + Notes + ----- + - Rules are applied in the order they appear in `self.rules` + - Each rule only receives source flows that haven't been matched yet + - New target flows are automatically normalized before being added + - The method logs information about each rule's performance + + Examples + -------- + >>> flowmap = Flowmap( + ... source_flows=[source_nf], + ... target_flows=[target_nf], + ... data_preparation_functions=[] + ... ) + >>> flowmap.generate_matches() + >>> len(flowmap.matches) + 1 + """ + for rule in self.rules: + start = time() + result = rule( + source_flows=[flow for flow in self.source_flows if not flow.matched], + target_flows=self.target_flows, + ) + elapsed = time() - start + + if new_target_flows := [ + obj.target for obj in result if obj.new_target_flow + ]: + self.add_new_target_flows(new_target_flows) + logger.info( + f"Match function {rule.__name__} produced {len(result)} matches and added {len(new_target_flows)} new target flows. It took {elapsed:.3} seconds." + ) + else: + logger.info( + f"Match function {rule.__name__} produced {len(result)} matches. It took {elapsed:.3} seconds." + ) + self.matches.extend(result) + + def add_new_target_flows(self, flows: list[Flow]) -> None: + """Add new target flows to the target flow list. + + This method is called automatically by `generate_matches()` when a + matching rule creates new target flows (indicated by `new_target_flow=True` + in Match objects). The new flows are normalized using the data + preparation functions before being added to the target flow list. + + Parameters + ---------- + flows : list[Flow] + List of Flow objects to add as new target flows. These flows are + normalized using `data_preparation_functions` before being added. + + Notes + ----- + - The flows are normalized using `apply_transformation_and_convert_flows_to_normalized_flows` + - Normalized flows are appended to `self.target_flows` + - This method is typically called automatically during `generate_matches()` + + Examples + -------- + >>> new_flow = Flow.from_dict({ + ... "name": "New flow", + ... "context": "air", + ... "unit": "kg" + ... }) + >>> flowmap.add_new_target_flows([new_flow]) + >>> len(flowmap.target_flows) + 2 + """ + normalized_flows = apply_transformation_and_convert_flows_to_normalized_flows( + functions=self.data_preparation_functions, flows=flows + ) + self.target_flows.extend(normalized_flows) + + def matched_source(self) -> list[NormalizedFlow]: + """Get a list of source flows that have been successfully matched. + + Returns all source flows that have been matched to at least one target + flow. A source flow is considered matched if its ID appears in any + Match object in the `matches` list. + + Returns + ------- + list[NormalizedFlow] + List of NormalizedFlow objects that have been matched. The list + is empty if no matches have been generated yet. + + Notes + ----- + - Uses the `_matched_source_flows_ids` cached property to determine + which flows have been matched + - Returns flows in the same order as they appear in `source_flows` + - Call `generate_matches()` first to populate matches + + Examples + -------- + >>> flowmap.generate_matches() + >>> matched = flowmap.matched_source() + >>> len(matched) + 1 + >>> matched[0].matched + True + """ + result = [ + flow + for flow in self.source_flows + if flow.id in self._matched_source_flows_ids + ] + return result + + @cached_property + def unmatched_source(self) -> list[NormalizedFlow]: + """Get a list of source flows that have not been matched. 
+
+        Returns all source flows that have not been matched to any target flow.
+        A source flow is considered unmatched if its ID does not appear in any
+        Match object in the `matches` list.
+
+        Returns
+        -------
+        list[NormalizedFlow]
+            List of NormalizedFlow objects that have not been matched. Returns
+            all source flows if no matches have been generated yet.
+
+        Notes
+        -----
+        - This is a cached property, so it's computed once and cached
+        - Uses the `_matched_source_flows_ids` cached property to determine
+          which flows have been matched
+        - Returns flows in the same order as they appear in `source_flows`
+        - The cache is not invalidated if the `matches` list changes
+
+        Examples
+        --------
+        >>> flowmap.generate_matches()
+        >>> unmatched = flowmap.unmatched_source
+        >>> len(unmatched)
+        0
+        """
+        result = [
+            flow
+            for flow in self.source_flows
+            if flow.id not in self._matched_source_flows_ids
+        ]
+        return result
+
+    def matched_source_statistics(self) -> pd.DataFrame:
+        """Calculate matching statistics grouped by source flow context.
+
+        Computes statistics showing how many source flows were matched for each
+        context, including the total number of source flows per context and
+        the matching percentage.
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame with columns:
+            - `context`: The context value
+            - `matched`: Number of matches for this context
+            - `total`: Total number of source flows in this context
+            - `percent`: Matching percentage (matched / total)
+            Rows are sorted by matching percentage (ascending).
+
+        Notes
+        -----
+        - Contexts with no matches will have `matched=0`
+        - Contexts with no source flows will have `total=0`
+        - Percentages are calculated as matched/total, which may be > 1.0 if
+          multiple matches exist per source flow
+        - Results are sorted by percentage (lowest first)
+
+        Examples
+        --------
+        >>> flowmap.generate_matches()
+        >>> stats = flowmap.matched_source_statistics()
+        >>> stats.columns.tolist()
+        ['context', 'matched', 'total', 'percent']
+        """
+        matched = pd.Series(
+            Counter([flow.source.context.value for flow in self.matches])
+        ).reset_index()
+        matched.columns = ["context", "matched"]
+
+        total = pd.Series(
+            Counter([flow.original.context.value for flow in self.source_flows])
+        ).reset_index()
+        total.columns = ["context", "total"]
+
+        df = pd.merge(matched, total, on="context", how="outer")
+        df = df.fillna(0).astype({"matched": "int", "total": "int"})
+
+        df["percent"] = df.matched / df.total
+        result = df.sort_values("percent")
+        return result
+
+    @cached_property
+    def matched_target_statistics(self) -> pd.DataFrame:
+        """Calculate matching statistics grouped by target flow context.
+
+        Computes statistics showing how many target flows were matched for each
+        context, including the total number of target flows per context and
+        the matching percentage.
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame with columns:
+            - `context`: The context value
+            - `matched`: Number of matches for this context
+            - `total`: Total number of target flows in this context
+            - `percent`: Matching percentage (matched / total)
+            Rows are sorted by matching percentage (ascending).
+
+        Notes
+        -----
+        - This is a cached property, so it's computed once and cached
+        - Contexts with no matches will have `matched=0`
+        - Contexts with no target flows will have `total=0`
+        - Percentages are calculated as matched/total, which may be > 1.0 if
+          multiple matches exist per target flow
+        - Results are sorted by percentage (lowest first)
+        - The cache is not invalidated if the `matches` or `target_flows` lists change
+
+        Examples
+        --------
+        >>> flowmap.generate_matches()
+        >>> stats = flowmap.matched_target_statistics
+        >>> stats.columns.tolist()
+        ['context', 'matched', 'total', 'percent']
+        """
+        matched = pd.Series(
+            Counter([flow.target.context.value for flow in self.matches])
+        ).reset_index()
+        matched.columns = ["context", "matched"]
+
+        total = pd.Series(
+            Counter([flow.original.context.value for flow in self.target_flows])
+        ).reset_index()
+        total.columns = ["context", "total"]
+
+        df = pd.merge(matched, total, on="context", how="outer")
+        df = df.fillna(0).astype({"matched": "int", "total": "int"})
+
+        df["percent"] = df.matched / df.total
+        result = df.sort_values("percent")
+        return result
+
+    def print_statistics(self) -> None:
+        """Print summary statistics for the flow mapping process.
+
+        Displays a formatted summary including:
+        - Number of source and target flows
+        - Total number of matches and percentage of source flows matched
+        - Cardinality distribution of mappings (1:1, 1:N, N:1, N:M)
+
+        The output is printed to stdout in a human-readable format.
+
+        Notes
+        -----
+        - Percentage is calculated as matches / source_flows
+        - Cardinalities are computed by `cardinalities()` method
+        - This method prints to stdout, so it's suitable for interactive use
+          but may need to be captured or redirected in automated contexts
+
+        Examples
+        --------
+        >>> flowmap.generate_matches()
+        >>> flowmap.print_statistics()
+        1 source and 1 target flows.
+        1 mappings (100.00% of total).
+        Mappings cardinalities: {'1:1': 1}
+        """
+        cardinalities = dict(Counter([x["cardinality"] for x in self.cardinalities()]))
+        percentage = (
+            len(self.matches) / len(self.source_flows) if self.source_flows else 0.0
+        )
+        print(
+            f"""{len(self.source_flows)} source and {len(self.target_flows)} target flows.
+{len(self.matches)} mappings ({percentage:.2%} of total).
+Mappings cardinalities: {str(cardinalities)}"""
+        )
+
+    def cardinalities(self) -> list[dict[str, int | str]]:
+        """Calculate and return the cardinality of each mapping.
+
+        Determines the relationship type (1:1, 1:N, N:1, or N:M) for each
+        match based on how many matches each source and target flow participate in.
+
+        Returns
+        -------
+        list[dict[str, int | str]]
+            List of dictionaries, each containing:
+            - `from`: Source flow internal ID
+            - `to`: Target flow internal ID
+            - `cardinality`: Relationship type as string ("1:1", "1:N", "N:1", or "N:M")
+            Results are sorted by source flow ID.
+ + Notes + ----- + - **1:1**: One source maps to one target, and that target maps only to this source + - **1:N**: One source maps to multiple targets + - **N:1**: Multiple sources map to the same target + - **N:M**: Multiple sources map to multiple targets (many-to-many) + - Cardinality is determined by counting how many matches each source + and target flow ID appears in + + Examples + -------- + >>> flowmap.generate_matches() + >>> card = flowmap.cardinalities() + >>> card[0] + {'from': 0, 'to': 0, 'cardinality': '1:1'} + """ + mappings = [(match.source._id, match.target._id) for match in self.matches] + lhs_counts = Counter([pair[0] for pair in mappings]) + rhs_counts = Counter([pair[1] for pair in mappings]) + + result = [] + + for lhs, rhs in mappings: + lhs_count = lhs_counts[lhs] + rhs_count = rhs_counts[rhs] + if lhs_count == 1 and rhs_count == 1: + result.append({"from": lhs, "to": rhs, "cardinality": "1:1"}) + elif lhs_count == 1 and rhs_count > 1: + result.append({"from": lhs, "to": rhs, "cardinality": "N:1"}) + elif lhs_count > 1 and rhs_count == 1: + result.append({"from": lhs, "to": rhs, "cardinality": "1:N"}) + elif lhs_count > 1 and rhs_count > 1: + result.append({"from": lhs, "to": rhs, "cardinality": "N:M"}) + + return sorted(result, key=lambda x: x["from"]) + + def to_randonneur( + self, + source_id: str, + target_id: str, + contributors: list, + mapping_source: dict, + mapping_target: dict, + version: str = "1.0.0", + licenses: list | None = None, + homepage: str | None = None, + name: str | None = None, + path: Path | None = None, + ) -> randonneur.Datapackage: + """Export mappings in randonneur data migration format. + + Creates a randonneur Datapackage containing all matches in a format + suitable for data migration and transformation workflows. The datapackage + can be saved to disk or returned for further processing. + + Parameters + ---------- + source_id : str + Identifier for the source flow list (e.g., "ecoinvent-3.8"). + target_id : str + Identifier for the target flow list (e.g., "ecoinvent-3.9"). + contributors : list + List of contributor information for the datapackage metadata. + mapping_source : dict + Mapping configuration for source flows (randonneur format). + mapping_target : dict + Mapping configuration for target flows (randonneur format). + version : str, default="1.0.0" + Version string for the datapackage. + licenses : list | None, optional + License information for the datapackage. + homepage : str | None, optional + Homepage URL for the datapackage. + name : str | None, optional + Name for the datapackage. If None, defaults to "{source_id}-{target_id}". + path : Path | None, optional + If provided, saves the datapackage as JSON to this path. + + Returns + ------- + randonneur.Datapackage + A Datapackage object containing all matches with verb "update". + The datapackage includes metadata and can be saved to disk if + `path` is provided. + + Notes + ----- + - All matches are exported using their `export()` method + - The datapackage description includes the flowmapper version + - If `path` is provided, the parent directory is created if it doesn't exist + + Examples + -------- + >>> dp = flowmap.to_randonneur( + ... source_id="source-v1", + ... target_id="target-v1", + ... contributors=[], + ... mapping_source={}, + ... mapping_target={} + ... 
) + >>> isinstance(dp, randonneur.Datapackage) + True + """ + dp = randonneur.Datapackage( + name=name or f"{source_id}-{target_id}", + source_id=source_id, + target_id=target_id, + description=f"Flowmapper {__version__} elementary flow correspondence from {source_id} to {target_id}", + contributors=contributors, + mapping_source=mapping_source, + mapping_target=mapping_target, + homepage=homepage, + version=version, + licenses=licenses, + ) + + dp.add_data(verb="update", data=[match.export() for match in self.matches]) + + if path is not None: + dp.to_json(path) + return dp + + def to_glad( + self, + path: Path | None = None, + ensure_id: bool = False, + missing_source: bool = False, + ) -> pd.DataFrame | None: + """Export mappings in GLAD (Global LCA Data Access) format. + + Creates a DataFrame or Excel file in the GLAD flow mapping format, + which is a standardized format for exchanging flow mappings in the + LCA community. + + Parameters + ---------- + path : Path | None, optional + If provided, exports the DataFrame to an Excel file at this path. + If None, returns the DataFrame without saving. + ensure_id : bool, default=False + If True, replaces None identifiers with empty strings. If False, + None identifiers remain as None in the DataFrame. + missing_source : bool, default=False + If True, includes unmatched source flows in the output with only + source flow information (no target flow data). + + Returns + ------- + pd.DataFrame | None + DataFrame with GLAD format columns: + - SourceFlowName, SourceFlowUUID, SourceFlowContext, SourceUnit + - MatchCondition, ConversionFactor + - TargetFlowName, TargetFlowUUID, TargetFlowContext, TargetUnit + - MemoMapper + Returns None if `path` is provided (file is saved instead). + + Notes + ----- + - If `path` is provided, creates an Excel file with auto-sized columns + - Unmatched source flows (when `missing_source=True`) only include + source flow columns, with target columns left empty + - Context values are exported as strings using "/" as separator + - Match conditions are converted using `MatchCondition.as_glad()` + - Excel files use xlsxwriter engine with formulas disabled + + Examples + -------- + >>> df = flowmap.to_glad() + >>> df.columns.tolist() + ['SourceFlowName', 'SourceFlowUUID', ...] 
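+    >>>
+    >>> # Include unmatched source flows as extra rows
+    >>> df_all = flowmap.to_glad(missing_source=True)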
+ >>> + >>> # Export to Excel + >>> flowmap.to_glad(path=Path("mapping.xlsx")) + """ + data = [] + for match in self.matches: + data.append( + { + "SourceFlowName": str(match.source.name), + "SourceFlowUUID": match.source.identifier + or ("" if ensure_id else None), + "SourceFlowContext": match.source.context.export_as_string( + join_character="/" + ), + "SourceUnit": str(match.source.unit), + "MatchCondition": match.condition.as_glad(), + "ConversionFactor": match.conversion_factor, + "TargetFlowName": str(match.target.name), + "TargetFlowUUID": match.target.identifier + or ("" if ensure_id else None), + "TargetFlowContext": match.target.context.export_as_string( + join_character="/" + ), + "TargetUnit": str(match.target.unit), + "MemoMapper": match.comment, + } + ) + + if missing_source: + for flow_obj in filter(lambda x: not x.matched, self.source_flows): + data.append( + { + "SourceFlowName": str(flow_obj.original.name), + "SourceFlowUUID": flow_obj.original.identifier + or ("" if ensure_id else None), + "SourceFlowContext": flow_obj.original.context.export_as_string(), + "SourceUnit": str(flow_obj.original.unit), + } + ) + + result = pd.DataFrame(data) + + if not path: + return result + else: + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + + writer = pd.ExcelWriter( + path, + engine="xlsxwriter", + engine_kwargs={"options": {"strings_to_formulas": False}}, + ) + result.to_excel(writer, sheet_name="Mapping", index=False, na_rep="NaN") + + for column in result: + column_length = max( + result[column].astype(str).map(len).max(), len(column) + ) + col_idx = result.columns.get_loc(column) + writer.sheets["Mapping"].set_column(col_idx, col_idx, column_length) + + writer.close() diff --git a/src/flowmapper/main.py b/src/flowmapper/main.py new file mode 100644 index 0000000..c2cb7b1 --- /dev/null +++ b/src/flowmapper/main.py @@ -0,0 +1,114 @@ +import json +import logging +from collections.abc import Callable +from pathlib import Path + +from randonneur import Datapackage +from randonneur_data import Registry + +from flowmapper.domain.flow import Flow +from flowmapper.domain.match import Match +from flowmapper.flowmap import Flowmap +from flowmapper.utils import ( + apply_transformation_and_convert_flows_to_normalized_flows, + randonneur_as_function, +) + +logger = logging.getLogger(__name__) + + +def sorting_function(obj: dict) -> tuple: + return ( + obj.get("name", "ZZZ"), + str(obj.get("context", "ZZZ")), + obj.get("unit", "ZZZ"), + ) + + +def flowmapper( + source: Path, + target: Path, + source_id: str, + target_id: str, + contributors: list, + output_dir: Path, + version: str = "1.0.0", + transformations: list[Datapackage | str | dict | Callable] | None = None, + rules: list[Callable[..., list[Match]]] | None = None, + unit_normalization: bool = True, + licenses: list | None = None, + homepage: str | None = None, + name: str | None = None, + registry: Registry | None = None, + no_matching: bool = False, +) -> Flowmap: + """ + Generate mappings between elementary flows lists + """ + output_dir.mkdir(parents=True, exist_ok=True) + transformation_functions = [] + + if transformations is None: + transformations = [] + + if unit_normalization: + transformations.append("Flowmapper-standard-units-harmonization") + + for obj in transformations: + if isinstance(obj, (str, dict, Datapackage)): + transformation_functions.append( + randonneur_as_function(datapackage=obj, registry=registry) + ) + elif isinstance(obj, Callable): + transformation_functions.append(obj) + else: + 
raise ValueError(f"Can't understand transformation {obj}") + + original_source_flows = [Flow.from_dict(obj) for obj in json.load(open(source))] + source_flows = apply_transformation_and_convert_flows_to_normalized_flows( + functions=transformation_functions, flows=original_source_flows + ) + + original_target_flows = [Flow.from_dict(obj) for obj in json.load(open(target))] + target_flows = apply_transformation_and_convert_flows_to_normalized_flows( + functions=transformation_functions, flows=original_target_flows + ) + + flowmap = Flowmap( + source_flows=source_flows, + target_flows=target_flows, + data_preparation_functions=transformation_functions, + rules=rules, + ) + if no_matching: + return flowmap + flowmap.generate_matches() + flowmap.print_statistics() + + stem = f"{source.stem}-{target.stem}" + + with open(output_dir / f"{stem}-unmatched-source.json", "w") as fs: + json.dump( + sorted( + [flow.export() for flow in source_flows if not flow.matched], + key=sorting_function, + ), + fs, + indent=True, + ) + + flowmap.to_randonneur( + source_id=source_id, + target_id=target_id, + contributors=contributors, + mapping_source=Flow.randonneur_mapping(), + mapping_target=Flow.randonneur_mapping(), + version=version, + licenses=licenses, + homepage=homepage, + name=name, + path=output_dir / f"{stem}.json", + ) + flowmap.to_glad(output_dir / f"{stem}.xlsx", missing_source=True) + + return flowmap diff --git a/flowmapper/manual_matching/data/simapro_ecoinvent_310/just_different.json b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/just_different.json similarity index 88% rename from flowmapper/manual_matching/data/simapro_ecoinvent_310/just_different.json rename to src/flowmapper/manual_matching/data/simapro_ecoinvent_310/just_different.json index db938de..aea33a3 100644 --- a/flowmapper/manual_matching/data/simapro_ecoinvent_310/just_different.json +++ b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/just_different.json @@ -1,4 +1,12 @@ [ + { + "source": { + "name": "water, well" + }, + "target": { + "name": "water, well, in ground" + } + }, { "source": { "name": "Parathion, methyl" @@ -79,6 +87,14 @@ "name": "AOX, Adsorbable Organic Halides" } }, + { + "source": { + "name": "AOX, Adsorbable Organic Halogen" + }, + "target": { + "name": "AOX, Adsorbable Organic Halides" + } + }, { "source": { "name": "AOX, Adsorbable Organic Halogen as Cl" @@ -143,6 +159,14 @@ "name": "Sand, unspecified" } }, + { + "source": { + "name": "Sand, quartz" + }, + "target": { + "name": "Sand, unspecified" + } + }, { "source": { "name": "Potassium chloride" @@ -175,14 +199,6 @@ "name": "2,2,4-Trimethylpentane" } }, - { - "source": { - "name": "2,4-D, dimethylamine salt" - }, - "target": { - "name": "2,4-D dimethylamine salt" - } - }, { "source": { "name": "Dioxin, 2,3,7,8 Tetrachlorodibenzo-p-" @@ -241,19 +257,25 @@ }, { "source": { - "name": "Gas, natural, 36 MJ per m3" + "name": "Gas, natural, 36 MJ per m3", + "unit": "cubic_meter" }, "target": { - "name": "Gas, natural, in ground" - } + "name": "Gas, natural", + "unit": "standard_cubic_meter" + }, + "conversion_factor": 1.0 }, { "source": { - "name": "Gas, mine, off-gas, process, coal mining, 36 MJ per m3" + "name": "Gas, mine, off-gas, process, coal mining, 36 MJ per m3", + "unit": "cubic_meter" }, "target": { - "name": "Gas, mine, off-gas, process, coal mining" - } + "name": "Gas, mine, off-gas, process, coal mining", + "unit": "standard_cubic_meter" + }, + "conversion_factor": 1.0 }, { "source": { @@ -303,6 +325,14 @@ "name": "Caesium I" 
} }, + { + "source": { + "name": "Cesium" + }, + "target": { + "name": "Caesium" + } + }, { "source": { "name": "Dimethyl formamide" @@ -396,11 +426,7 @@ "name": "Roundup" }, "target": { - "name": "Glyphosate", - "context": [ - "air", - "non-urban air or from high stacks" - ] + "name": "Glyphosate" } } ] diff --git a/flowmapper/manual_matching/data/simapro_ecoinvent_310/land_use_not_in_ecoinvent.json b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/land_use_not_in_ecoinvent.json similarity index 98% rename from flowmapper/manual_matching/data/simapro_ecoinvent_310/land_use_not_in_ecoinvent.json rename to src/flowmapper/manual_matching/data/simapro_ecoinvent_310/land_use_not_in_ecoinvent.json index 629c5b0..159cd6c 100644 --- a/flowmapper/manual_matching/data/simapro_ecoinvent_310/land_use_not_in_ecoinvent.json +++ b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/land_use_not_in_ecoinvent.json @@ -219,14 +219,6 @@ "name": "Transformation, to seabed, unspecified" } }, - { - "source": { - "name": "Transformation, to industrial area, built up" - }, - "target": { - "name": "Transformation, to industrial area" - } - }, { "source": { "name": "Transformation, to industrial area, vegetation" @@ -283,6 +275,14 @@ "name": "Transformation, to river, artificial" } }, + { + "source": { + "name": "Transformation, from urban, green areas" + }, + "target": { + "name": "Transformation, from urban, green area" + } + }, { "source": { "name": "Transformation, to water courses, artificial" diff --git a/flowmapper/manual_matching/data/simapro_ecoinvent_310/ores.json b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/ores.json similarity index 87% rename from flowmapper/manual_matching/data/simapro_ecoinvent_310/ores.json rename to src/flowmapper/manual_matching/data/simapro_ecoinvent_310/ores.json index 80e7508..1e3ee5b 100644 --- a/flowmapper/manual_matching/data/simapro_ecoinvent_310/ores.json +++ b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/ores.json @@ -87,15 +87,6 @@ "name": "Energy, gross calorific value, in biomass" } }, - { - "source": { - "name": "Gas, mine, off-gas, process, coal mining/m3" - }, - "target": { - "name": "Gas, mine, off-gas, process, coal mining", - "unit": "Sm3" - } - }, { "source": { "name": "Silver, Ag 9.7E-4%, Au 9.7E-4%, Zn 0.63%, Cu 0.38%, Pb 0.014%, in ore" @@ -199,5 +190,35 @@ "target": { "name": "Rhodium" } + }, + { + "source": { + "name": "Gas, mine, off-gas, process, coal mining/m3", + "unit": "cubic_meter" + }, + "target": { + "name": "Gas, mine, off-gas, process, coal mining", + "unit": "standard_cubic_meter" + }, + "conversion_factor": 1.0 + }, + { + "source": { + "name": "Gas, natural/m3", + "unit": "cubic_meter" + }, + "target": { + "name": "Gas, natural", + "unit": "standard_cubic_meter" + }, + "conversion_factor": 1.0 + }, + { + "source": { + "name": "Energy, from hydro power" + }, + "target": { + "name": "Energy, potential (in hydropower reservoir), converted" + } } ] diff --git a/flowmapper/manual_matching/data/simapro_ecoinvent_310/regionalized_random.json b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/regionalized_random.json similarity index 100% rename from flowmapper/manual_matching/data/simapro_ecoinvent_310/regionalized_random.json rename to src/flowmapper/manual_matching/data/simapro_ecoinvent_310/regionalized_random.json diff --git a/flowmapper/manual_matching/data/simapro_ecoinvent_310/unit_conversions.json b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/unit_conversions.json similarity 
index 78% rename from flowmapper/manual_matching/data/simapro_ecoinvent_310/unit_conversions.json rename to src/flowmapper/manual_matching/data/simapro_ecoinvent_310/unit_conversions.json index 306cd0f..ca0ef0b 100644 --- a/flowmapper/manual_matching/data/simapro_ecoinvent_310/unit_conversions.json +++ b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/unit_conversions.json @@ -2,35 +2,33 @@ { "source": { "name": "Gas, mine, off-gas, process, coal mining/m3", - "unit": "cubic meter", - "context": ["natural resource", "in ground"] + "unit": "cubic_meter" }, "target": { "name": "Gas, mine, off-gas, process, coal mining", - "unit": "Sm3" + "unit": "standard_cubic_meter" }, "conversion_factor": 1.0 }, { "source": { "name": "Gas, natural/m3", - "unit": "m3", - "context": ["natural resource", "in ground"] + "unit": "cubic_meter" }, "target": { "name": "Gas, natural", - "unit": "Sm3" + "unit": "standard_cubic_meter" }, "conversion_factor": 1.0 }, { "source": { "name": "Energy, from peat", - "unit": "MJ" + "unit": "megajoule" }, "target": { "name": "Peat", - "unit": "kg" + "unit": "kilogram" }, "conversion_factor": 9.9, "comment": "Conversion factor from ecoinvent 3.10 CED LCIA factors" @@ -38,11 +36,11 @@ { "source": { "name": "Energy, from uranium", - "unit": "MJ" + "unit": "megajoule" }, "target": { "name": "Uranium", - "unit": "kg" + "unit": "kilogram" }, "conversion_factor": 560000.0, "comment": "Conversion factor from ecoinvent 3.10 CED LCIA factors" @@ -50,11 +48,11 @@ { "source": { "name": "Energy, from coal, brown", - "unit": "MJ" + "unit": "megajoule" }, "target": { "name": "Coal, brown", - "unit": "kg" + "unit": "kilogram" }, "conversion_factor": 9.9, "comment": "Conversion factor from ecoinvent 3.10 CED LCIA factors" @@ -62,11 +60,11 @@ { "source": { "name": "Energy, from gas, natural", - "unit": "MJ" + "unit": "megajoule" }, "target": { "name": "Gas, natural", - "unit": "Sm3" + "unit": "standard_cubic_meter" }, "conversion_factor": 40.3, "comment": "Conversion factor from ecoinvent 3.10 CED LCIA factors" @@ -74,11 +72,11 @@ { "source": { "name": "Energy, from oil sand (10% bitumen)", - "unit": "MJ" + "unit": "megajoule" }, "target": { "name": "Oil, crude", - "unit": "kg" + "unit": "kilogram" }, "conversion_factor": 45.8, "comment": "Conversion factor from ecoinvent 3.10 CED LCIA factors" @@ -86,11 +84,11 @@ { "source": { "name": "Energy, from oil sand (100% bitumen)", - "unit": "MJ" + "unit": "megajoule" }, "target": { "name": "Oil, crude", - "unit": "kg" + "unit": "kilogram" }, "conversion_factor": 45.8, "comment": "Conversion factor from ecoinvent 3.10 CED LCIA factors" @@ -101,8 +99,6 @@ }, "target": { "name": "Energy, potential (in hydropower reservoir), converted" - }, - "conversion_factor": 40.3, - "comment": "Conversion factor from ecoinvent 3.10 CED LCIA factors" + } } ] diff --git a/flowmapper/manual_matching/data/simapro_ecoinvent_310/water_misc.json b/src/flowmapper/manual_matching/data/simapro_ecoinvent_310/water_misc.json similarity index 100% rename from flowmapper/manual_matching/data/simapro_ecoinvent_310/water_misc.json rename to src/flowmapper/manual_matching/data/simapro_ecoinvent_310/water_misc.json diff --git a/flowmapper/manual_matching/results/simapro-2024-ecoinvent-3.10-biosphere.json b/src/flowmapper/manual_matching/results/simapro-2024-ecoinvent-3.10-biosphere.json similarity index 100% rename from flowmapper/manual_matching/results/simapro-2024-ecoinvent-3.10-biosphere.json rename to 
src/flowmapper/manual_matching/results/simapro-2024-ecoinvent-3.10-biosphere.json diff --git a/flowmapper/manual_matching/simapro_ecoinvent_310.py b/src/flowmapper/manual_matching/simapro_ecoinvent_310.py similarity index 71% rename from flowmapper/manual_matching/simapro_ecoinvent_310.py rename to src/flowmapper/manual_matching/simapro_ecoinvent_310.py index 8bf554c..7c760f8 100644 --- a/flowmapper/manual_matching/simapro_ecoinvent_310.py +++ b/src/flowmapper/manual_matching/simapro_ecoinvent_310.py @@ -1,8 +1,8 @@ -import randonneur as rn -import randonneur_data as rd -from pathlib import Path import json +from pathlib import Path +import randonneur as rn +import randonneur_data as rd data_dir = Path(__file__).parent / "data" / "simapro_ecoinvent_310" results_dir = Path(__file__).parent / "results" @@ -24,25 +24,21 @@ def generate_simapro_ecoinvent_310_manual_matches( "ores.json", ] non_resources = { - 'Caesium': 'Caesium I', - 'Calcium': 'Calcium II', - 'Sodium': 'Sodium I', - 'Strontium': 'Strontium II', + "Caesium": "Caesium I", + "Calcium": "Calcium II", + "Sodium": "Sodium I", + "Strontium": "Strontium II", } non_resource_categories = [ - obj['source']['context'] - for obj in json.load(open(base_data_dir / "simapro-2023-ecoinvent-3-contexts.json"))["update"] - if obj['target']['context'][0] != "natural resource" + obj["source"]["context"] + for obj in json.load( + open(base_data_dir / "simapro-2023-ecoinvent-3-contexts.json") + )["update"] + if obj["target"]["context"][0] != "natural resource" ] data = [ - { - 'source': { - 'name': key, - 'context': context - }, - 'target': {'name': value} - } + {"source": {"name": key, "context": context}, "target": {"name": value}} for key, value in non_resources.items() for context in non_resource_categories ] @@ -52,18 +48,17 @@ def generate_simapro_ecoinvent_310_manual_matches( registry = rd.Registry() migration = registry.get_file("ecoinvent-3.9.1-biosphere-ecoinvent-3.10-biosphere") name_change = { - (pair['source']['name'], pair['target']['name']) - for pair in migration['replace'] - if 'name' in pair['target'] - and 'name' in pair['source'] - and pair['source']['name'] != pair['target']['name'] - and pair['source']['name'] not in non_resources + (pair["source"]["name"], pair["target"]["name"]) + for pair in migration["replace"] + if "name" in pair["target"] + and "name" in pair["source"] + and pair["source"]["name"] != pair["target"]["name"] + and pair["source"]["name"] not in non_resources } assert len(name_change) == len({a for a, b in name_change}) - data.extend([ - {'source': {'name': a}, 'target': {'name': b}} - for a, b in name_change - ]) + data.extend( + [{"source": {"name": a}, "target": {"name": b}} for a, b in name_change] + ) dp = rn.Datapackage( name="SimaPro-2024-to-ecoinvent-3.10-elementary-flows", diff --git a/src/flowmapper/matching/__init__.py b/src/flowmapper/matching/__init__.py new file mode 100644 index 0000000..9df92c6 --- /dev/null +++ b/src/flowmapper/matching/__init__.py @@ -0,0 +1,63 @@ +"""Matching functions for flow mapping. + +This package contains functions for matching flows between source and target +flow lists. Functions are organized by type: + +- core: Core utilities for transformation and matching +- basic: Basic matching functions (identical names, CAS numbers, etc.) 
+- transformation: Transformation-based matching functions +- context: Context-based matching functions +- specialized: Specialized matching functions (regionalized flows, suffixes) +- rules: Default matching rules configuration +""" + +from flowmapper.matching.basic import ( + match_close_names, + match_identical_cas_numbers, + match_identical_identifier, + match_identical_names, + match_identical_names_lowercase, + match_identical_names_without_commas, +) +from flowmapper.matching.context import ( + match_name_and_parent_context, + match_resources_with_wrong_subcontext, +) +from flowmapper.matching.core import get_matches, transform_and_then_match +from flowmapper.matching.ecoinvent import match_ecoinvent_transitive_matching +from flowmapper.matching.rules import match_rules, match_rules_simapro_ecoinvent +from flowmapper.matching.simapro import ( + manual_simapro_ecoinvent_mapping, + simapro_ecoinvent_glad_name_matching, +) +from flowmapper.matching.specialized import ( + add_missing_regionalized_flows, + match_names_with_suffix_removal, +) + +__all__ = [ + # Core + "transform_and_then_match", + "get_matches", + # Basic + "match_identical_identifier", + "match_identical_cas_numbers", + "match_identical_names", + "match_close_names", + "match_identical_names_lowercase", + "match_identical_names_without_commas", + # Transformation + "match_ecoinvent_transitive_matching", + # Context + "match_resources_with_wrong_subcontext", + "match_name_and_parent_context", + # Specialized + "add_missing_regionalized_flows", + "match_names_with_suffix_removal", + # Rules + "match_rules", + "match_rules_simapro_ecoinvent", + # SimaPro + "manual_simapro_ecoinvent_mapping", + "simapro_ecoinvent_glad_name_matching", +] diff --git a/src/flowmapper/matching/basic.py b/src/flowmapper/matching/basic.py new file mode 100644 index 0000000..944723a --- /dev/null +++ b/src/flowmapper/matching/basic.py @@ -0,0 +1,473 @@ +"""Basic matching functions. + +This module contains basic matching functions that match flows based on +identical or similar attributes without transformations. +""" + +import re + +from rapidfuzz.distance.DamerauLevenshtein import distance + +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.matching.core import get_matches +from flowmapper.utils import toolz + + +def match_identical_identifier( + source_flows: list[NormalizedFlow], + target_flows: list[NormalizedFlow], +) -> list: + """Match flows with identical identifiers. + + This function groups source flows by their identifier and matches them + to target flows with the same identifier. Only flows with non-None + identifiers are considered. + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. + target_flows : list[NormalizedFlow] + List of target flows to match against. + + Returns + ------- + list[Match] + List of Match objects with MatchCondition.exact for flows with + matching identifiers. + + Notes + ----- + - Only flows with non-None identifiers are matched + - If multiple target flows share the same identifier, `get_matches` will + only allow a single result target per source flow + - Match condition is always MatchCondition.exact + """ + matches = [] + + for source_id, sources in toolz.itertoolz.groupby( + lambda x: x.identifier, source_flows + ).items(): + if not source_id: + continue + matches.extend( + get_matches( + source_flows=sources, + # Filter target flows with matching identifier. 
We don't need to worry about + # duplicate identifiers as `get_matches` will only allow a single result target + target_flows=[ + flow for flow in target_flows if source_id == flow.identifier + ], + comment=f"Shared target-unique identifier: {source_id}", + function_name="match_identical_identifier", + match_condition=MatchCondition.exact, + ) + ) + + return matches + + +def match_identical_cas_numbers( + source_flows: list[NormalizedFlow], target_flows: list[NormalizedFlow] +) -> list: + """Match flows with identical CAS numbers, context, and location. + + This function matches flows that share the same CAS (Chemical Abstracts + Service) registry number, context, and location. All three attributes + must match exactly. + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. + target_flows : list[NormalizedFlow] + List of target flows to match against. + + Returns + ------- + list[Match] + List of Match objects with MatchCondition.exact for flows with + matching CAS numbers, context, and location. + + Notes + ----- + - CAS number, context, and location must all match exactly + - Match condition is always MatchCondition.exact + - Only unit-compatible flows are matched + """ + matches = [] + + for (cas_number, context, location), sources in toolz.itertoolz.groupby( + lambda x: (x.cas_number, x.context, x.location), source_flows + ).items(): + matches.extend( + get_matches( + source_flows=sources, + target_flows=[ + flow + for flow in target_flows + if flow.cas_number == cas_number + and flow.context == context + and flow.location == location + ], + comment=f"Shared CAS code with identical context and location: {cas_number}", + function_name="match_identical_cas_numbers", + match_condition=MatchCondition.exact, + ) + ) + + return matches + + +def match_identical_names( + source_flows: list[NormalizedFlow], + target_flows: list[NormalizedFlow], + function_name: str | None = None, + comment: str | None = None, + match_condition: MatchCondition | None = None, +) -> list: + """Match flows with identical normalized names, context, oxidation state, and location. + + This is one of the most precise matching functions, requiring exact matches + on normalized name, context, oxidation state, and location. All four + attributes must match exactly. + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. + target_flows : list[NormalizedFlow] + List of target flows to match against. + + Returns + ------- + list[Match] + List of Match objects with MatchCondition.exact for flows with + identical normalized names, context, oxidation state, and location. 
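+
+    See Also
+    --------
+    match_identical_names_lowercase : Case-insensitive variant of this matcher.
+    match_close_names : Fuzzy variant that tolerates small spelling differences.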
+ + Notes + ----- + - All four attributes (name, context, oxidation_state, location) must match exactly + - Names are compared after normalization + - Match condition is always MatchCondition.exact + - Only unit-compatible flows are matched + """ + matches = [] + + for (name, context, oxidation_state, location), sources in toolz.itertoolz.groupby( + lambda x: (x.name, x.context, x.oxidation_state, x.location), source_flows + ).items(): + matches.extend( + get_matches( + source_flows=sources, + target_flows=[ + target + for target in target_flows + if target.name == name + and target.context == context + and target.oxidation_state == oxidation_state + and target.location == location + ], + comment=comment + or f"Shared normalized name with identical context, oxidation state, and location: {name}", + function_name=function_name or "match_identical_names", + match_condition=match_condition or MatchCondition.exact, + ) + ) + + return matches + + +def match_close_names( + source_flows: list[NormalizedFlow], target_flows: list[NormalizedFlow] +) -> list: + """Match flows with similar names using Damerau-Levenshtein distance. + + This function matches flows where the normalized names have a Damerau- + Levenshtein edit distance of less than 3, while still requiring exact + matches on context, oxidation state, and location. + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. + target_flows : list[NormalizedFlow] + List of target flows to match against. + + Returns + ------- + list[Match] + List of Match objects with MatchCondition.related for flows with + similar names (edit distance < 3) and identical context, oxidation + state, and location. + + Notes + ----- + - Uses Damerau-Levenshtein distance with case-insensitive comparison + - Edit distance must be less than 3 (i.e., 0, 1, or 2) + - Context, oxidation state, and location must still match exactly + - Match condition is MatchCondition.related (not exact due to name differences) + - Only unit-compatible flows are matched + """ + matches = [] + + for (name, context, oxidation_state, location), sources in toolz.itertoolz.groupby( + lambda x: (x.name, x.context, x.oxidation_state, x.location), source_flows + ).items(): + matches.extend( + get_matches( + source_flows=sources, + target_flows=[ + target + for target in target_flows + if distance( + str(target.name), str(name), processor=lambda x: x.lower() + ) + < 3 + and target.context == context + and target.oxidation_state == oxidation_state + and target.location == location + ], + comment=f"Name has Damerau Levenshtein edit distance of 2 or lower with identical context, oxidation state, and location: {name}", + function_name="match_close_names", + match_condition=MatchCondition.related, + ) + ) + + return matches + + +def match_identical_names_lowercase( + source_flows: list[NormalizedFlow], + target_flows: list[NormalizedFlow], + function_name: str | None = None, + comment: str | None = None, + match_condition: MatchCondition | None = None, +) -> list: + """Match flows with identical names when compared in lowercase. + + This function matches flows where the normalized names are identical when + converted to lowercase, while still requiring exact matches on context, + oxidation state, and location. This handles cases where names differ only + in capitalization. + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. + target_flows : list[NormalizedFlow] + List of target flows to match against. 
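+    function_name : str | None, optional
+        Name of the matching function. Defaults to
+        "match_identical_names_lowercase".
+    comment : str | None, optional
+        Comment to include in Match objects. Defaults to a description of the
+        shared attributes.
+    match_condition : MatchCondition | None, optional
+        Match condition to use. Defaults to MatchCondition.close.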
+ + Returns + ------- + list[Match] + List of Match objects with MatchCondition.close for flows with + identical lowercase names and identical context, oxidation state, + and location. + + Notes + ----- + - Names are compared in lowercase (case-insensitive) + - Context, oxidation state, and location must still match exactly + - Match condition is MatchCondition.close (not exact due to case differences) + - Only unit-compatible flows are matched + """ + matches = [] + + for (name, context, oxidation_state, location), sources in toolz.itertoolz.groupby( + lambda x: (x.name, x.context, x.oxidation_state, x.location), source_flows + ).items(): + name = name.lower() + matches.extend( + get_matches( + source_flows=sources, + target_flows=[ + flow + for flow in target_flows + if flow.name.lower() == name + and flow.context == context + and flow.oxidation_state == oxidation_state + and flow.location == location + ], + comment=comment + or f"Shared normalized lowercase name with identical context, oxidation state, and location: {name}", + function_name=function_name or "match_identical_names_lowercase", + match_condition=match_condition or MatchCondition.close, + ) + ) + + return matches + + +def match_identical_names_without_commas( + source_flows: list[NormalizedFlow], target_flows: list[NormalizedFlow] +) -> list: + """Match flows with identical names when commas are removed. + + This function matches flows where the normalized names are identical after + removing all commas, while still requiring exact matches on context, + oxidation state, and location. This handles cases where names differ only + in comma placement. + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. + target_flows : list[NormalizedFlow] + List of target flows to match against. + + Returns + ------- + list[Match] + List of Match objects with MatchCondition.close for flows with + identical names (after removing commas) and identical context, + oxidation state, and location. + + Notes + ----- + - All commas are removed from names before comparison + - Context, oxidation state, and location must still match exactly + - Match condition is MatchCondition.close (not exact due to comma differences) + - Only unit-compatible flows are matched + """ + matches = [] + + for (name, context, oxidation_state, location), sources in toolz.itertoolz.groupby( + lambda x: (x.name, x.context, x.oxidation_state, x.location), source_flows + ).items(): + matches.extend( + get_matches( + source_flows=sources, + target_flows=[ + flow + for flow in target_flows + if flow.name.replace(",", "") == name.replace(",", "") + and flow.context == context + and flow.oxidation_state == oxidation_state + and flow.location == location + ], + comment=f"Shared normalized name with commas removed and identical context, oxidation state, and location: {name}", + match_condition=MatchCondition.close, + function_name="match_identical_names_without_commas", + ) + ) + + return matches + + +is_uuid = re.compile( + r"^[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}$" +) + + +def match_identical_names_target_uuid_identifier( + source_flows: list[NormalizedFlow], + target_flows: list[NormalizedFlow], + function_name: str | None = None, + comment: str | None = None, + match_condition: MatchCondition | None = None, +) -> list: + """Match flows with identical normalized names, context, oxidation state, and location. 
+
+    This function is similar to `match_identical_names`, but with the additional
+    requirement that target flows must have a UUID identifier. It is used in cases
+    where the target flow list contains two otherwise identical flows that we could
+    match to; normally we reject a match with multiple candidates. Instead of fixing
+    these manually, we prefer the target flow with a UUID identifier. This is a hack,
+    so only use this function as a last resort.
+
+    Parameters
+    ----------
+    source_flows : list[NormalizedFlow]
+        List of source flows to match.
+    target_flows : list[NormalizedFlow]
+        List of target flows to match against. Only flows with UUID identifiers
+        will be considered.
+    function_name : str | None, optional
+        Name of the matching function. Defaults to
+        "match_identical_names_target_uuid_identifier".
+    comment : str | None, optional
+        Comment to include in Match objects. Defaults to a description of the
+        shared attributes.
+    match_condition : MatchCondition | None, optional
+        Match condition to use. Defaults to MatchCondition.exact.
+
+    Returns
+    -------
+    list[Match]
+        List of Match objects with MatchCondition.exact (or the specified condition)
+        for flows with identical normalized names, context, oxidation state,
+        and location, where the target flow has a UUID identifier.
+
+    Notes
+    -----
+    - All four attributes (name, context, oxidation_state, location) must match exactly
+    - Target flows must have a non-None identifier that matches the UUID format
+    - UUID format is validated using regex: 8-4-4-4-12 hexadecimal digits
+    - Names are compared after normalization
+    - Match condition defaults to MatchCondition.exact
+    - Only unit-compatible flows are matched (enforced by `get_matches`)
+
+    Examples
+    --------
+    >>> from flowmapper.domain.flow import Flow
+    >>> from flowmapper.domain.normalized_flow import NormalizedFlow
+    >>> from copy import copy
+    >>>
+    >>> source = Flow.from_dict({
+    ...     "name": "Carbon dioxide",
+    ...     "context": "air",
+    ...     "unit": "kg"
+    ... })
+    >>> source_nf = NormalizedFlow(
+    ...     original=source,
+    ...     normalized=source.normalize(),
+    ...     current=copy(source.normalize())
+    ... )
+    >>>
+    >>> target = Flow.from_dict({
+    ...     "name": "Carbon dioxide",
+    ...     "context": "air",
+    ...     "unit": "kg",
+    ...     "identifier": "550e8400-e29b-41d4-a716-446655440000"  # Valid UUID
+    ... })
+    >>> target_nf = NormalizedFlow(
+    ...     original=target,
+    ...     normalized=target.normalize(),
+    ...     current=copy(target.normalize())
+    ... )
+    >>>
+    >>> matches = match_identical_names_target_uuid_identifier(
+    ...     source_flows=[source_nf],
+    ...     target_flows=[target_nf]
+    ...
) + >>> len(matches) + 1 + """ + matches = [] + + for (name, context, oxidation_state, location), sources in toolz.itertoolz.groupby( + lambda x: (x.name, x.context, x.oxidation_state, x.location), source_flows + ).items(): + matches.extend( + get_matches( + source_flows=sources, + target_flows=[ + target + for target in target_flows + if target.name == name + and target.context == context + and target.oxidation_state == oxidation_state + and target.location == location + and target.identifier is not None + and is_uuid.match(target.identifier) + ], + comment=comment + or f"Shared normalized name with identical context, oxidation state, and location: {name}", + function_name=function_name + or "match_identical_names_target_uuid_identifier", + match_condition=match_condition or MatchCondition.exact, + ) + ) + + return matches diff --git a/src/flowmapper/matching/context.py b/src/flowmapper/matching/context.py new file mode 100644 index 0000000..36eb5e2 --- /dev/null +++ b/src/flowmapper/matching/context.py @@ -0,0 +1,139 @@ +"""Context-based matching functions. + +This module contains matching functions that match flows based on context +relationships. +""" + +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.matching.core import get_matches +from flowmapper.utils import toolz + + +def match_resources_with_wrong_subcontext( + source_flows: list[NormalizedFlow], + target_flows: list[NormalizedFlow], + function_name: str | None = None, + comment: str | None = None, + match_condition: MatchCondition | None = None, +) -> list: + """Match resource flows ignoring subcontext differences. + + This function matches flows that are both resource-type flows (as + determined by `context.is_resource()`), have identical names, oxidation + states, and locations, but may have different subcontexts. This handles + cases where resource flows are categorized differently but represent the + same resource. + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. Only resource-type flows are considered. + target_flows : list[NormalizedFlow] + List of target flows to match against. Only resource-type flows are + considered. + + Returns + ------- + list[Match] + List of Match objects with MatchCondition.close for resource flows + with identical names, oxidation states, and locations, but potentially + different subcontexts. 
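+
+    See Also
+    --------
+    match_name_and_parent_context : Matches flows whose target context is the
+        parent of the source context.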
+ + Notes + ----- + - Only flows where `normalized.context.is_resource()` returns True are matched + - Name, oxidation state, and location must match exactly + - Subcontext differences are ignored (both must be resource-type) + - Match condition is MatchCondition.close (not exact due to subcontext differences) + - Only unit-compatible flows are matched + """ + matches = [] + + for (name, oxidation_state, location), sources in toolz.itertoolz.groupby( + lambda x: (x.name, x.oxidation_state, x.location), + filter(lambda f: f.normalized.context.is_resource(), source_flows), + ).items(): + matches.extend( + get_matches( + source_flows=sources, + target_flows=[ + flow + for flow in target_flows + if flow.name == name + and flow.normalized.context.is_resource() + and flow.oxidation_state == oxidation_state + and flow.location == location + ], + comment=comment + or f"Shared normalized name and resource-type context, with identical oxidation state and location: {name}", + match_condition=match_condition or MatchCondition.close, + function_name=function_name or "match_resources_with_wrong_subcontext", + ) + ) + + return matches + + +def match_name_and_parent_context( + source_flows: list[NormalizedFlow], target_flows: list[NormalizedFlow] +) -> list: + """Match flows where target has parent context of source. + + This function matches flows where the source flow has a multi-level context + (e.g., ["emissions", "to air"]) and the target flow has the parent context + (e.g., ["emissions"]). This handles cases where flows are categorized at + different levels of specificity. + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. Only flows with multi-level contexts + (length > 1) are considered. + target_flows : list[NormalizedFlow] + List of target flows to match against. + + Returns + ------- + list[Match] + List of Match objects with MatchCondition.related for flows where the + target context is the parent of the source context. + + Notes + ----- + - Only source flows with contexts of length > 1 are considered + - Target context must exactly match the parent of source context (context[:-1]) + - Name, oxidation state, and location must match exactly + - Match condition is MatchCondition.related (not exact due to context differences) + - Only unit-compatible flows are matched + + Examples + -------- + >>> # Source: context=["emissions", "to air"] + >>> # Target: context=["emissions"] + >>> # These will match if name, oxidation_state, and location also match + """ + matches = [] + + for (name, oxidation_state, context, location), sources in toolz.itertoolz.groupby( + lambda x: (x.name, x.oxidation_state, x.context, x.location), + filter(lambda f: len(f.context) > 1, source_flows), + ).items(): + matches.extend( + get_matches( + source_flows=sources, + target_flows=[ + flow + for flow in target_flows + if flow.name == name + and flow.context == context[:-1] + and flow.oxidation_state == oxidation_state + and flow.location == location + ], + comment="Shared normalized name and parent context, with identical oxidation state and location", + match_condition=MatchCondition.related, + function_name="match_name_and_parent_context", + ) + ) + + return matches diff --git a/src/flowmapper/matching/core.py b/src/flowmapper/matching/core.py new file mode 100644 index 0000000..e057ff7 --- /dev/null +++ b/src/flowmapper/matching/core.py @@ -0,0 +1,258 @@ +"""Core matching utilities. 
+ +This module contains core utility functions for matching flows, including +transformation and filtering support. +""" + +import itertools +from collections.abc import Callable + +from flowmapper.domain.match import Match +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow + + +def transform_and_then_match( + source_flows: list[NormalizedFlow], + target_flows: list[NormalizedFlow], + match_function: Callable, + transform_source_flows: list[Callable] | None = None, + transform_target_flows: list[Callable] | None = None, + filter_source_flows: Callable | None = None, + filter_target_flows: Callable | None = None, +) -> list[Match]: + """Apply transformations and filters to flows, then match them. + + This function provides a flexible way to apply transformations and filters + to source and target flows before matching, while ensuring all flows are + reset to their normalized state after matching completes. + + The function applies transformations and filters in the following order: + 1. Transform source flows (if provided) - applies all transformations in sequence + 2. Filter source flows (if provided) + 3. Transform target flows (if provided) - applies all transformations in sequence + 4. Filter target flows (if provided) + 5. Call match function with filtered flows + 6. Reset all flows to normalized state + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. + target_flows : list[NormalizedFlow] + List of target flows to match against. + match_function : Callable + Function that performs the actual matching. Must accept keyword arguments + `source_flows` and `target_flows` (both lists of NormalizedFlow) and return + a list of Match objects. + transform_source_flows : list[Callable[[list[NormalizedFlow]], list[NormalizedFlow]]] | None + Optional list of functions to transform source flows. Functions are applied + in sequence. Each function takes a list of NormalizedFlow objects and returns + a modified list. Functions should modify flows in place (e.g., using + update_current) and return the same list. + transform_target_flows : list[Callable[[list[NormalizedFlow]], list[NormalizedFlow]]] | None + Optional list of functions to transform target flows. Functions are applied + in sequence. Each function takes a list of NormalizedFlow objects and returns + a modified list. Functions should modify flows in place (e.g., using + update_current) and return the same list. + filter_source_flows : Callable[[list[NormalizedFlow]], list[NormalizedFlow]] | None + Optional function to filter source flows. Takes a list of NormalizedFlow objects + and returns a filtered list (may be shorter than input). + filter_target_flows : Callable[[list[NormalizedFlow]], list[NormalizedFlow]] | None + Optional function to filter target flows. Takes a list of NormalizedFlow objects + and returns a filtered list (may be shorter than input). + + Returns + ------- + list[Match] + List of Match objects found by the match function. + + Examples + -------- + >>> from flowmapper.matching import match_identical_names, transform_and_then_match + >>> from flowmapper.utils import apply_randonneur + >>> from functools import partial + >>> + >>> # Transform flows with a single function (wrap in list) + >>> transform_func = partial( + ... apply_randonneur, + ... datapackage="some-transformation", + ... fields=["name", "context"] + ... ) + >>> + >>> matches = transform_and_then_match( + ... 
source_flows=source_flows, + ... target_flows=target_flows, + ... match_function=match_identical_names, + ... transform_source_flows=[transform_func], + ... transform_target_flows=[transform_func] + ... ) + >>> + >>> # Transform flows with multiple functions in sequence + >>> transform1 = partial(apply_randonneur, datapackage="transformation-1", fields=["name"]) + >>> transform2 = partial(apply_randonneur, datapackage="transformation-2", fields=["context"]) + >>> + >>> matches = transform_and_then_match( + ... source_flows=source_flows, + ... target_flows=target_flows, + ... match_function=match_identical_names, + ... transform_source_flows=[transform1, transform2], + ... transform_target_flows=[transform1, transform2] + ... ) + >>> + >>> # Filter flows before matching + >>> def filter_resources(flows): + ... return [f for f in flows if f.normalized.context.is_resource()] + >>> + >>> matches = transform_and_then_match( + ... source_flows=source_flows, + ... target_flows=target_flows, + ... match_function=match_identical_names, + ... filter_source_flows=filter_resources, + ... filter_target_flows=filter_resources + ... ) + + Notes + ----- + - All flows (both source and target) are automatically reset to their normalized + state after matching completes successfully. If the match function raises an + exception, flows will not be reset. + - When multiple transformations are provided in a list, they are applied in + sequence. The output of each transformation becomes the input to the next. + - To apply a single transformation, wrap it in a list: `[transform_func]` + """ + # Apply source flow transformations + if transform_source_flows is None: + transformed_source_flows = source_flows + else: + # Apply multiple transformations in sequence + transformed_source_flows = source_flows + for transform_func in transform_source_flows: + transformed_source_flows = transform_func(transformed_source_flows) + + # Apply source flow filters + filtered_source_flows = ( + filter_source_flows(transformed_source_flows) + if filter_source_flows + else transformed_source_flows + ) + + # Apply target flow transformations + if transform_target_flows is None: + transformed_target_flows = target_flows + else: + # Apply multiple transformations in sequence + transformed_target_flows = target_flows + for transform_func in transform_target_flows: + transformed_target_flows = transform_func(transformed_target_flows) + + # Apply target flow filters + filtered_target_flows = ( + filter_target_flows(transformed_target_flows) + if filter_target_flows + else transformed_target_flows + ) + + matches = match_function( + source_flows=filtered_source_flows, target_flows=filtered_target_flows + ) + + for flow in itertools.chain(source_flows, target_flows): + flow.reset_current() + + return matches + + +def get_matches( + source_flows: list[NormalizedFlow], + target_flows: list[NormalizedFlow], + comment: str, + function_name: str, + match_condition: MatchCondition, +) -> list[Match]: + """Create Match objects from source and target flows. + + This is a helper function used by various matching functions to create + Match objects with proper unit compatibility checking and conversion + factor calculation. It handles the common logic of: + - Filtering target flows by unit compatibility + - Resolving multiple target matches by context matching + - Calculating conversion factors + - Marking source flows as matched + + Parameters + ---------- + source_flows : list[NormalizedFlow] + List of source flows to match. 
Each source flow will be matched
+        against compatible target flows.
+    target_flows : list[NormalizedFlow]
+        List of target flows to match against. Only unit-compatible flows
+        are considered.
+    comment : str
+        Comment to include in each Match object describing the match.
+    function_name : str
+        Name of the matching function that created this match (e.g.,
+        "match_identical_names").
+    match_condition : MatchCondition
+        The match quality condition (exact, close, related, etc.).
+
+    Returns
+    -------
+    list[Match]
+        List of Match objects. Each Match represents a successful match
+        between a source flow and a target flow.
+
+    Notes
+    -----
+    - Only unit-compatible flows are matched (checked via `unit_compatible()`)
+    - If multiple target flows are unit-compatible, the function tries to
+      find the most appropriate match by matching normalized contexts
+    - If exactly one target flow matches after context filtering, a Match
+      is created and the source flow is marked as matched
+    - Conversion factors are calculated automatically using
+      `source.conversion_factor(target)` which accounts for both unit
+      conversion and any transformation factors
+    - The function only creates matches when there is exactly one target
+      flow remaining after filtering
+
+    Examples
+    --------
+    >>> matches = get_matches(
+    ...     source_flows=[source_flow],
+    ...     target_flows=[target_flow1, target_flow2],
+    ...     comment="Shared identifier",
+    ...     function_name="match_identical_identifier",
+    ...     match_condition=MatchCondition.exact
+    ... )
+    """
+    if not target_flows:
+        return []
+
+    matches = []
+
+    for source in source_flows:
+        targets = [flow for flow in target_flows if source.unit_compatible(flow)]
+        if len(targets) > 1:
+            # Try to find the most appropriate match if more than one is present. Added
+            # because ecoinvent deprecated most stratospheric emissions and redirected them
+            # to air, unspecified, so now all air, unspecified emissions have multiple targets.
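+            # Keep only candidates whose normalized context matches the source's exactly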
+            targets = [
+                target
+                for target in targets
+                if target.normalized.context == source.normalized.context
+            ]
+        if len(targets) == 1:
+            target = targets[0]
+            source.matched = True
+            matches.append(
+                Match(
+                    source=source.original,
+                    target=target.original,
+                    function_name=function_name,
+                    comment=comment or "",
+                    condition=match_condition,
+                    conversion_factor=source.conversion_factor(target),
+                )
+            )
+
+    return matches
diff --git a/src/flowmapper/matching/ecoinvent.py b/src/flowmapper/matching/ecoinvent.py
new file mode 100644
index 0000000..5582d57
--- /dev/null
+++ b/src/flowmapper/matching/ecoinvent.py
@@ -0,0 +1,31 @@
+from functools import partial
+
+from flowmapper.domain.match_condition import MatchCondition
+from flowmapper.matching.basic import match_identical_names
+from flowmapper.matching.core import transform_and_then_match
+from flowmapper.utils import apply_randonneur
+
+match_ecoinvent_transitive_matching = partial(
+    transform_and_then_match,
+    match_function=partial(
+        match_identical_names,
+        function_name="match_ecoinvent_transitive_matching",
+        comment="Shared normalized attributes after applying transformation: ecoinvent-2.2-biosphere-ecoinvent-3.12-biosphere-transitive",
+        match_condition=MatchCondition.close,
+    ),
+    transform_source_flows=[
+        partial(
+            apply_randonneur,
+            datapackage="ecoinvent-2.2-biosphere-ecoinvent-3.12-biosphere-transitive",
+            fields=["name", "context"],
+        )
+    ],
+    transform_target_flows=[
+        partial(
+            apply_randonneur,
+            datapackage="ecoinvent-2.2-biosphere-ecoinvent-3.12-biosphere-transitive",
+            fields=["name", "context"],
+        )
+    ],
+)
+match_ecoinvent_transitive_matching.__name__ = "match_ecoinvent_transitive_matching"
diff --git a/src/flowmapper/matching/rules.py b/src/flowmapper/matching/rules.py
new file mode 100644
index 0000000..98f40c1
--- /dev/null
+++ b/src/flowmapper/matching/rules.py
@@ -0,0 +1,86 @@
+"""Matching rules configuration.
+
+This module provides the default set of matching rules used by Flowmap.
+"""
+
+from flowmapper.matching.basic import (
+    match_identical_cas_numbers,
+    match_identical_identifier,
+    match_identical_names,
+    match_identical_names_target_uuid_identifier,
+    match_identical_names_without_commas,
+)
+from flowmapper.matching.context import (
+    match_name_and_parent_context,
+    match_resources_with_wrong_subcontext,
+)
+from flowmapper.matching.ecoinvent import match_ecoinvent_transitive_matching
+from flowmapper.matching.simapro import (
+    manual_simapro_ecoinvent_mapping,
+    manual_simapro_ecoinvent_mapping_add_regionalized_flows,
+    manual_simapro_ecoinvent_mapping_resource_wrong_subcontext,
+    simapro_ecoinvent_glad_name_matching,
+)
+from flowmapper.matching.specialized import (
+    add_missing_regionalized_flows,
+    match_names_with_suffix_removal,
+)
+
+
+def match_rules():
+    """Return the default list of matching functions.
+
+    This function returns the default ordered list of matching functions
+    used by Flowmap. The functions are applied in order, and matching
+    stops once a flow is successfully matched.
+
+    Returns
+    -------
+    list[Callable]
+        List of matching functions to apply in order. Each function must
+        accept `source_flows` and `target_flows` keyword arguments and
+        return a list of Match objects.
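+
+    See Also
+    --------
+    match_rules_simapro_ecoinvent : Extended rule list for SimaPro 2024 to
+        ecoinvent 3.10 biosphere matching.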
+
+    Notes
+    -----
+    - Functions are applied in order from most specific to least specific
+    - Once a flow is matched, it is not considered by subsequent functions
+    - `match_rules_simapro_ecoinvent` extends this list with specialized
+      rules for SimaPro to ecoinvent biosphere matching
+
+    Examples
+    --------
+    >>> rules = match_rules()
+    >>> for rule in rules:
+    ...     matches = rule(source_flows=source, target_flows=target)
+    ...     # Process matches...
+    """
+    return [
+        match_identical_identifier,
+        match_identical_names,
+        match_identical_names_without_commas,
+        match_resources_with_wrong_subcontext,
+        match_name_and_parent_context,
+        match_identical_cas_numbers,
+        match_names_with_suffix_removal,
+    ]
+
+
+def match_rules_simapro_ecoinvent():
+    """Return the ordered list of matching functions for SimaPro to ecoinvent biosphere matching."""
+    return [
+        match_identical_identifier,
+        match_identical_names,
+        match_identical_names_without_commas,
+        match_ecoinvent_transitive_matching,
+        match_resources_with_wrong_subcontext,
+        match_name_and_parent_context,
+        manual_simapro_ecoinvent_mapping,
+        simapro_ecoinvent_glad_name_matching,
+        manual_simapro_ecoinvent_mapping_add_regionalized_flows,
+        manual_simapro_ecoinvent_mapping_resource_wrong_subcontext,
+        add_missing_regionalized_flows,
+        match_identical_cas_numbers,
+        match_identical_names_target_uuid_identifier,
+        match_names_with_suffix_removal,
+    ]
diff --git a/src/flowmapper/matching/simapro.py b/src/flowmapper/matching/simapro.py
new file mode 100644
index 0000000..24e2b8d
--- /dev/null
+++ b/src/flowmapper/matching/simapro.py
@@ -0,0 +1,121 @@
+from functools import partial
+
+from randonneur_data import Registry
+
+from flowmapper.domain.match_condition import MatchCondition
+from flowmapper.matching import match_identical_names_lowercase
+from flowmapper.matching.context import match_resources_with_wrong_subcontext
+from flowmapper.matching.core import transform_and_then_match
+from flowmapper.matching.specialized import add_missing_regionalized_flows
+from flowmapper.utils import apply_randonneur
+
+manual_simapro_ecoinvent_mapping = partial(
+    transform_and_then_match,
+    match_function=partial(
+        match_identical_names_lowercase,
+        function_name="manual_simapro_ecoinvent_mapping",
+        comment="Shared normalized attributes after applying transformation: simapro-2024-biosphere-ecoinvent-3.10-biosphere",
+        match_condition=MatchCondition.related,
+    ),
+    transform_source_flows=[
+        partial(
+            apply_randonneur,
+            datapackage="simapro-2024-biosphere-ecoinvent-3.10-biosphere",
+            fields=["name", "unit"],
+        )
+    ],
+)
+manual_simapro_ecoinvent_mapping.__name__ = "manual_simapro_ecoinvent_mapping"
+
+
+manual_simapro_ecoinvent_mapping_add_regionalized_flows = partial(
+    transform_and_then_match,
+    match_function=partial(
+        add_missing_regionalized_flows,
+        function_name="manual_simapro_ecoinvent_mapping_add_regionalized_flows",
+    ),
+    transform_source_flows=[
+        partial(
+            apply_randonneur,
+            datapackage="simapro-2024-biosphere-ecoinvent-3.10-biosphere",
+            fields=["name", "unit"],
+        )
+    ],
+)
+manual_simapro_ecoinvent_mapping_add_regionalized_flows.__name__ = (
+    "manual_simapro_ecoinvent_mapping_add_regionalized_flows"
+)
+
+
+manual_simapro_ecoinvent_mapping_resource_wrong_subcontext = partial(
+    transform_and_then_match,
+    match_function=partial(
+        match_resources_with_wrong_subcontext,
+        function_name="manual_simapro_ecoinvent_mapping_resource_wrong_subcontext",
+    ),
+    transform_source_flows=[
+        partial(
+            apply_randonneur,
+            datapackage="simapro-2024-biosphere-ecoinvent-3.10-biosphere",
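+            # `fields` restricts which flow attributes randonneur uses when
+            # matching and rewriting flows with this datapackage (passed through
+            # to randonneur's MigrationConfig; see utils/randonneur.py below)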
+            fields=["name", "unit"],
+        )
+    ],
+)
+manual_simapro_ecoinvent_mapping_resource_wrong_subcontext.__name__ = (
+    "manual_simapro_ecoinvent_mapping_resource_wrong_subcontext"
+)
+
+
+def _get_normalized_matching() -> dict:
+    """Load the transitive GLAD datapackage and normalize its source contexts."""
+    registry = Registry()
+
+    context_mapping = {
+        line["source"]["context"]: line["target"]["context"]
+        for line in registry.get_file("SimaPro-2025-ecoinvent-3.12-context")["update"]
+    }
+
+    dp = registry.get_file(
+        "simapro-2025-biosphere-ef-3.1-biosphere-ecoinvent-3.12-biosphere-transitive"
+    )
+
+    # Remove indoor mappings - these were deleted from ecoinvent, so map to other subcontexts.
+    # However, there is no guarantee that they will have the _same_ mapping in that subcontext
+    # as the other, existing mapping, and multiple conflicting mappings will raise an error.
+    dp["update"] = [
+        row for row in dp["update"] if not row["source"]["context"].endswith("indoor")
+    ]
+
+    for row in dp["update"]:
+        # Our source flows are already normalized to this form
+        row["source"]["context"] = context_mapping[row["source"]["context"]]
+
+    return dp
+
+
+simapro_ecoinvent_glad_name_matching = partial(
+    transform_and_then_match,
+    transform_source_flows=[
+        partial(
+            apply_randonneur,
+            datapackage=_get_normalized_matching(),
+            fields=["name", "context"],
+        )
+    ],
+    match_function=partial(
+        match_identical_names_lowercase,
+        function_name="simapro_ecoinvent_glad_name_matching",
+        comment="Shared normalized attributes after applying transformation: simapro-2025-biosphere-ef-3.1-biosphere-ecoinvent-3.12-biosphere-transitive",
+        match_condition=MatchCondition.related,
+    ),
+)
+simapro_ecoinvent_glad_name_matching.__name__ = "simapro_ecoinvent_glad_name_matching"
diff --git a/src/flowmapper/matching/specialized.py b/src/flowmapper/matching/specialized.py
new file mode 100644
index 0000000..fe13064
--- /dev/null
+++ b/src/flowmapper/matching/specialized.py
@@ -0,0 +1,330 @@
+"""Specialized matching functions.
+
+This module contains specialized matching functions for specific use cases
+like regionalized flows and suffix matching.
+"""
+
+from flowmapper.domain.match import Match
+from flowmapper.domain.match_condition import MatchCondition
+from flowmapper.domain.normalized_flow import NormalizedFlow
+from flowmapper.utils import toolz
+
+
+def add_missing_regionalized_flows(
+    source_flows: list[NormalizedFlow],
+    target_flows: list[NormalizedFlow],
+    function_name: str | None = None,
+    comment: str | None = None,
+    match_condition: MatchCondition | None = None,
+) -> list[Match]:
+    """Add missing regionalized flows based on existing regionalized flows.
+
+    If a source flow has a location and there are target flows with the same
+    name, context, and oxidation state but different locations, create a new
+    target flow for the source location.
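+
+    For example (hypothetical data), a source flow with location "NL" can be
+    matched to a copy of an existing "DE" target flow whose location is
+    rewritten to "NL"; see the Examples section below.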
+
+    The function groups source flows by (name, oxidation_state, context, location)
+    and for each group:
+    - If there are other regionalized target flows (same name/context/oxidation_state
+      but different location), uses the first one as a template
+    - Otherwise, if there is exactly one non-regionalized target flow (same
+      name/context/oxidation_state but no location), uses that as a template
+    - Creates a new target flow by copying the template and setting the source's
+      location using `copy_with_new_location`
+
+    Parameters
+    ----------
+    source_flows : list[NormalizedFlow]
+        List of source flows to match. Only flows with a location are considered.
+    target_flows : list[NormalizedFlow]
+        List of target flows to match against.
+    function_name : str | None, optional
+        Name of the matching function (currently not used, defaults to
+        "add_missing_regionalized_flows").
+    comment : str | None, optional
+        Comment for matches (currently not used, defaults to a description of
+        the new target flow).
+    match_condition : MatchCondition | None, optional
+        Match condition (currently not used, defaults to MatchCondition.related).
+
+    Returns
+    -------
+    list[Match]
+        List of Match objects with new_target_flow=True. Each match represents
+        a source flow matched to a newly created target flow.
+
+    Notes
+    -----
+    - Only source flows with a location are considered
+    - Target flows must be unit-compatible with source flows to create matches
+    - The new target flow is created using `copy_with_new_location`, which sets
+      a new UUID identifier
+    - All matches are created with `MatchCondition.related` and
+      `new_target_flow=True`
+
+    Examples
+    --------
+    >>> source = NormalizedFlow.from_dict({
+    ...     "name": "Carbon dioxide, NL",
+    ...     "context": "air",
+    ...     "unit": "kg"
+    ... })
+    >>> target = NormalizedFlow.from_dict({
+    ...     "name": "Carbon dioxide, DE",
+    ...     "context": "air",
+    ...     "unit": "kg"
+    ... })
+    >>> matches = add_missing_regionalized_flows(
+    ...     source_flows=[source],
+    ...     target_flows=[target]
+    ... )
+    >>> len(matches)
+    1
+    >>> matches[0].new_target_flow
+    True
+    """
+    matches = []
+
+    for (name, oxidation_state, context, location), sources in toolz.itertoolz.groupby(
+        lambda x: (x.name, x.oxidation_state, x.context, x.location),
+        filter(lambda x: x.location, source_flows),
+    ).items():
+        other_regions = [
+            flow
+            for flow in target_flows
+            if flow.name == name
+            and flow.context == context
+            and flow.oxidation_state == oxidation_state
+            and flow.location
+            and flow.location != location
+        ]
+        non_regionalized = [
+            flow
+            for flow in target_flows
+            if flow.name == name
+            and flow.context == context
+            and flow.oxidation_state == oxidation_state
+            and flow.location is None
+        ]
+
+        if other_regions:
+            target = other_regions[0]
+
+            for source in sources:
+                if source.unit_compatible(target):
+                    source.matched = True
+                    matches.append(
+                        Match(
+                            source=source.original,
+                            target=target.original.copy_with_new_location(
+                                location=location
+                            ),
+                            function_name="add_missing_regionalized_flows",
+                            comment=f"Added new target flow for location {location}, with shared name, context, and oxidation state",
+                            condition=MatchCondition.related,
+                            conversion_factor=source.conversion_factor(target),
+                            new_target_flow=True,
+                        )
+                    )
+        elif len(non_regionalized) == 1:
+            target = non_regionalized[0]
+
+            for source in sources:
+                if source.unit_compatible(target):
+                    source.matched = True
+                    matches.append(
+                        Match(
+                            source=source.original,
+                            target=target.original.copy_with_new_location(
+                                location=location
+                            ),
+                            function_name="add_missing_regionalized_flows",
+                            comment=f"Added new target flow for location {location}, with shared name, context, and oxidation state",
+                            condition=MatchCondition.related,
+                            conversion_factor=source.conversion_factor(target),
+                            new_target_flow=True,
+                        )
+                    )
+
+    return matches
+
+
+def equivalent_names(a: str, b: str) -> bool:
+    """Check if two flow names are equivalent after removing certain suffixes.
+
+    This function determines if two flow names represent the same substance by
+    checking if they differ only by specific suffixes that don't change the
+    fundamental identity of the flow. It handles two types of equivalences:
+
+    1. **Suffix removal**: Names are equivalent if one has a suffix and the
+       other doesn't, but the base names match. Supported suffixes:
+       - ", in ground"
+       - ", ion"
+       - ", in air"
+       - ", in water"
+       - ", unspecified origin"
+
+    2. **Biogenic/non-fossil equivalence**: Names ending with ", biogenic" and
+       ", non-fossil" are considered equivalent if the base names match.
+
+    Parameters
+    ----------
+    a : str
+        First flow name to compare.
+    b : str
+        Second flow name to compare.
+
+    Returns
+    -------
+    bool
+        True if the names are equivalent (differ only by supported suffixes),
+        False otherwise.
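+
+    See Also
+    --------
+    match_names_with_suffix_removal : matching function built on this check.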
+
+    Notes
+    -----
+    - The function is case-sensitive for the base name comparison
+    - Suffix matching is exact (must match the full suffix string)
+    - For biogenic/non-fossil equivalence, the base names must match exactly
+      after removing the respective suffixes (10 chars for ", biogenic" and
+      12 chars for ", non-fossil")
+    - The ", ion" suffix is safe to ignore because matching functions also
+      check for matching oxidation states, ensuring correct matching
+
+    Examples
+    --------
+    >>> equivalent_names("Carbon dioxide, in air", "Carbon dioxide")
+    True
+    >>> equivalent_names("Carbon dioxide", "Carbon dioxide, in air")
+    True
+    >>> equivalent_names("Carbon dioxide, in ground", "Carbon dioxide, in air")
+    False
+    >>> equivalent_names("Methane, biogenic", "Methane, non-fossil")
+    True
+    >>> equivalent_names("Carbon dioxide, ion", "Carbon dioxide")
+    True
+    >>> equivalent_names("Carbon dioxide", "Carbon monoxide")
+    False
+    """
+    suffixes = [
+        ", in ground",
+        ", ion",  # OK because we still check for single match and matching oxidation state
+        ", in air",
+        ", in water",
+        ", unspecified origin",
+    ]
+    for suffix in suffixes:
+        if a.endswith(suffix) and not b.endswith(suffix) and a[: -len(suffix)] == b:
+            return True
+        if b.endswith(suffix) and not a.endswith(suffix) and b[: -len(suffix)] == a:
+            return True
+    if a.endswith(", biogenic") and b.endswith(", non-fossil") and a[:-10] == b[:-12]:
+        return True
+    if b.endswith(", biogenic") and a.endswith(", non-fossil") and b[:-10] == a[:-12]:
+        return True
+    return False
+
+
+def match_names_with_suffix_removal(
+    source_flows: list[NormalizedFlow],
+    target_flows: list[NormalizedFlow],
+    function_name: str | None = None,
+    comment: str | None = None,
+    match_condition: MatchCondition | None = None,
+) -> list[Match]:
+    """Match flows where names are equivalent after removing certain suffixes.
+
+    This function matches source and target flows where the names are considered
+    equivalent by `equivalent_names`, meaning they differ only by supported
+    suffixes (e.g., ", in air", ", in ground", ", ion", ", biogenic"/", non-fossil").
+    In addition to name equivalence, flows must also have matching:
+    - Context
+    - Oxidation state
+    - Location
+
+    The function groups source flows by (name, context, oxidation_state, location)
+    and for each group, finds target flows with equivalent names (using
+    `equivalent_names`) and matching attributes.
+
+    Parameters
+    ----------
+    source_flows : list[NormalizedFlow]
+        List of source flows to match. Flows are grouped by name, context,
+        oxidation state, and location.
+    target_flows : list[NormalizedFlow]
+        List of target flows to match against. Only flows with equivalent names
+        and matching attributes are considered.
+    function_name : str | None, optional
+        Name of the matching function. Defaults to "match_names_with_suffix_removal".
+    comment : str | None, optional
+        Comment for matches. Defaults to a descriptive string about suffix removal.
+    match_condition : MatchCondition | None, optional
+        The match quality condition. Defaults to MatchCondition.close.
+
+    Returns
+    -------
+    list[Match]
+        List of Match objects representing successful matches. Each match has
+        a source flow and target flow with equivalent names (after suffix removal)
+        and matching context, oxidation state, and location.
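+
+    See Also
+    --------
+    equivalent_names : the name-equivalence check used by this function.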
+
+    Notes
+    -----
+    - Names are compared in lowercase for matching
+    - Only unit-compatible flows are matched (handled by `get_matches`)
+    - The function uses `equivalent_names` to determine name equivalence
+    - Supported suffixes include: ", in ground", ", ion", ", in air", ", in water",
+      ", unspecified origin", and the biogenic/non-fossil pair
+    - If multiple target flows match, `get_matches` handles resolution based on
+      context matching
+
+    Examples
+    --------
+    >>> from flowmapper.domain.normalized_flow import NormalizedFlow
+    >>> from flowmapper.matching.specialized import match_names_with_suffix_removal
+    >>>
+    >>> source = NormalizedFlow.from_dict({
+    ...     "name": "Carbon dioxide, in air",
+    ...     "context": "air",
+    ...     "unit": "kg"
+    ... })
+    >>> target = NormalizedFlow.from_dict({
+    ...     "name": "Carbon dioxide",
+    ...     "context": "air",
+    ...     "unit": "kg"
+    ... })
+    >>> matches = match_names_with_suffix_removal(
+    ...     source_flows=[source],
+    ...     target_flows=[target]
+    ... )
+    >>> len(matches)
+    1
+    >>> matches[0].condition
+    MatchCondition.close
+    """
+    from flowmapper.matching.core import get_matches
+
+    matches = []
+
+    for (name, context, oxidation_state, location), sources in toolz.itertoolz.groupby(
+        lambda x: (x.name, x.context, x.oxidation_state, x.location), source_flows
+    ).items():
+        name = name.lower()
+        matches.extend(
+            get_matches(
+                source_flows=sources,
+                target_flows=[
+                    flow
+                    for flow in target_flows
+                    if equivalent_names(name, flow.name.lower())
+                    and flow.context == context
+                    and flow.oxidation_state == oxidation_state
+                    and flow.location == location
+                ],
+                comment=comment
+                or f"Shared normalized lowercase name with suffix removed and identical context, oxidation state, and location: {name}",
+                function_name=function_name or "match_names_with_suffix_removal",
+                match_condition=match_condition or MatchCondition.close,
+            )
+        )
+
+    return matches
diff --git a/flowmapper/preferred_synonyms.py b/src/flowmapper/preferred_synonyms.py
similarity index 57%
rename from flowmapper/preferred_synonyms.py
rename to src/flowmapper/preferred_synonyms.py
index c678fd9..c3c2b01 100644
--- a/flowmapper/preferred_synonyms.py
+++ b/src/flowmapper/preferred_synonyms.py
@@ -1,6 +1,6 @@
 import re
 
-from flowmapper.flow import Flow
+from flowmapper.domain.flow import Flow
 
 ROMAN_NUMERAL_PATTERN = re.compile(r"\b\(?[ivx]+[\+-]?\)?\s*$", flags=re.IGNORECASE)
 PARENTHESES_PATTERN = re.compile(r"\([1-9]+[\+-]?\)\s*$")
@@ -35,24 +35,48 @@ def has_number_pattern_at_end(text: str) -> bool:
 def match_identical_names_in_preferred_synonyms(
-    s: Flow, t: Flow, comment: str = "Identical preferred synonyms"
+    source_flows: list[Flow],
+    target_flows: list[Flow],
+    comment: str = "Identical preferred synonyms",
 ):
     if t.synonyms and s.name in t.synonyms and s.context == t.context:
         if s.name.normalized in t.name.normalized and (
             has_roman_numeral_at_end(t.name.normalized)
             or has_number_pattern_at_end(t.name.normalized)
         ):
+            # Check if there's another target flow with a different name that shares the same synonym
+            for other_target in target_flows:
+                if (
+                    other_target is not t
+                    and other_target.name.normalized != t.name.normalized
+                    and other_target.synonyms
+                    and s.name in other_target.synonyms
+                    and other_target.context == s.context
+                ):
+                    return None
             return {"comment": comment}
     elif s.synonyms and t.name in s.synonyms and s.context == t.context:
         if t.name.normalized in s.name.normalized and (
             has_roman_numeral_at_end(s.name.normalized)
             or has_number_pattern_at_end(s.name.normalized)
        ):
+            # Check if there's another target flow that shares the same synonym
+            for other_target in target_flows:
+                if (
+                    other_target is not t
+                    and other_target.name.normalized != t.name.normalized
+                    and other_target.synonyms
+                    and t.name in other_target.synonyms
+                    and other_target.context == s.context
+                ):
+                    return None
             return {"comment": comment}
 
 
 def match_identical_names_in_synonyms(
-    s: Flow, t: Flow, comment: str = "Identical synonyms"
+    source_flows: list[Flow],
+    target_flows: list[Flow],
+    comment: str = "Identical synonyms",
 ):
     if (t.synonyms and s.name in t.synonyms and s.context == t.context) or (
         s.synonyms and t.name in s.synonyms and s.context == t.context
diff --git a/src/flowmapper/unit.py b/src/flowmapper/unit.py
new file mode 100644
index 0000000..95e4746
--- /dev/null
+++ b/src/flowmapper/unit.py
@@ -0,0 +1,66 @@
+import importlib.resources as resource
+import json
+import math
+from collections import UserString
+from pathlib import Path
+from typing import Any, Self
+
+from pint import UnitRegistry, errors
+
+from flowmapper.utils import normalize_str
+
+ureg = UnitRegistry()
+
+with resource.as_file(resource.files("flowmapper") / "data" / "units.txt") as filepath:
+    ureg.load_definitions(filepath)
+
+with open(Path(__file__).parent / "data" / "standard-units-harmonization.json") as f:
+    UNIT_MAPPING = {
+        line["source"]["unit"]: line["target"]["unit"]
+        for line in json.load(f)["update"]
+    }
+
+
+class UnitField(UserString):
+    def normalize(self) -> Self:
+        """Normalize string to fit into our `pint` definitions"""
+        label = normalize_str(self.data)
+        if label in UNIT_MAPPING:
+            label = UNIT_MAPPING[label]
+        try:
+            ureg(label)
+        except errors.UndefinedUnitError:
+            raise ValueError(
+                f"Unit {label} is unknown; add it to flowmapper's `units.txt` or define a mapping in `standard-units-harmonization.json`"
+            )
+        # Makes type checkers happy, if inelegant...
+        return type(self)(label)
+
+    def is_uri(self, value: str) -> bool:
+        # Placeholder for when we support glossary entries
+        return False
+
+    def resolve_uri(self, uri: str) -> None:
+        # Placeholder
+        pass
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, UnitField):
+            return self.data == other.data or self.conversion_factor(other) == 1
+        else:
+            return self.data == other
+
+    def compatible(self, other: Any) -> bool:
+        return math.isfinite(self.conversion_factor(other))
+
+    def conversion_factor(self, to: Any) -> float:
+        if not isinstance(to, (UnitField, str)):
+            result = float("nan")
+        elif isinstance(to, UnitField) and self.data == to.data:
+            result = 1.0
+        else:
+            try:
+                result = ureg(self.data).to(ureg(str(to))).magnitude
+            except (errors.DimensionalityError, errors.UndefinedUnitError):
+                result = float("nan")
+        return result
diff --git a/src/flowmapper/utils/__init__.py b/src/flowmapper/utils/__init__.py
new file mode 100644
index 0000000..c030053
--- /dev/null
+++ b/src/flowmapper/utils/__init__.py
@@ -0,0 +1,57 @@
+"""Utility functions for flowmapper.
+
+This package contains utility functions organized by functionality:
+- context: Context-related utilities
+- strings: String manipulation utilities
+- flow_names: Flow name processing
+- randonneur: Randonneur-based transformations
+- files: File I/O utilities
+- constants: Shared constants and data
+"""
+
+from flowmapper.utils.constants import (
+    RESULTS_DIR,
+    default_registry,
+    logger,
+    names_and_locations,
+    toolz,
+)
+from flowmapper.utils.context import (
+    MISSING_VALUES,
+    as_normalized_tuple,
+    tupleize_context,
+)
+from flowmapper.utils.files import load_standard_transformations, read_migration_files
+from flowmapper.utils.flow_names import remove_unit_slash, unit_slash
+from flowmapper.utils.randonneur import (
+    apply_randonneur,
+    apply_transformation_and_convert_flows_to_normalized_flows,
+    randonneur_as_function,
+)
+from flowmapper.utils.strings import normalize_str, rowercase
+
+__all__ = [
+    # Constants
+    "RESULTS_DIR",
+    "default_registry",
+    "logger",
+    "names_and_locations",
+    "toolz",
+    # Context
+    "MISSING_VALUES",
+    "as_normalized_tuple",
+    "tupleize_context",
+    # Strings
+    "normalize_str",
+    "rowercase",
+    # Flow names
+    "remove_unit_slash",
+    "unit_slash",
+    # Randonneur
+    "apply_transformation_and_convert_flows_to_normalized_flows",
+    "apply_randonneur",
+    "randonneur_as_function",
+    # Files
+    "load_standard_transformations",
+    "read_migration_files",
+]
diff --git a/src/flowmapper/utils/constants.py b/src/flowmapper/utils/constants.py
new file mode 100644
index 0000000..6a83165
--- /dev/null
+++ b/src/flowmapper/utils/constants.py
@@ -0,0 +1,25 @@
+"""Shared constants and data for flowmapper utilities."""
+
+import importlib.resources as resource
+import json
+from pathlib import Path
+
+import structlog
+from randonneur_data import Registry
+
+logger = structlog.get_logger("flowmapper")
+default_registry = Registry()
+RESULTS_DIR = Path(__file__).parent.parent / "manual_matching" / "results"
+
+with resource.as_file(
+    resource.files("flowmapper") / "data" / "names_and_locations.json"
+) as filepath:
+    with open(filepath) as fs:
+        names_and_locations = {o["source"]: o for o in json.load(fs)}
+
+try:
+    import cytoolz as toolz
+except ImportError:
+    logger.info("Install `cytoolz` to get a speed-up in matching functions")
+    import toolz
+
+assert toolz  # Keep the import visible to linters; `toolz` is re-exported for matching functions
diff --git a/src/flowmapper/utils/context.py b/src/flowmapper/utils/context.py
new file mode 100644
index 0000000..191e4f2
--- /dev/null
+++ b/src/flowmapper/utils/context.py
@@ -0,0 +1,72 @@
+"""Context-related utility functions."""
+
+from typing import Any
+
+MISSING_VALUES = {
+    "",
+    "(unknown)",
+    "(unspecified)",
+    "null",
+    "unknown",
+    "unspecified",
+}
+
+
+def as_normalized_tuple(value: Any) -> tuple[str, ...]:
+    """Convert context inputs to normalized tuple form."""
+    if isinstance(value, (tuple, list)):
+        intermediate = value
+    elif isinstance(value, str) and "/" in value:
+        intermediate = list(value.split("/"))
+    elif isinstance(value, str):
+        intermediate = [value]
+    else:
+        raise ValueError(f"Can't understand input context {value}")
+
+    intermediate = [elem.lower().strip() for elem in intermediate]
+
+    while intermediate and intermediate[-1] in MISSING_VALUES:
+        if len(intermediate) == 1:
+            break
+        intermediate = intermediate[:-1]
+
+    return tuple(intermediate)
+
+
+def tupleize_context(obj: dict) -> dict:
+    """Convert `context` value to `tuple` if possible.
+
+    Handles both individual migration objects and full datapackage structures.
+    For datapackages, iterates through verb keys (like "update", "create") and
+    processes all migration objects in those lists.
+    """
+    # Handle datapackage structure with verb keys (update, create, etc.)
+    if isinstance(obj, dict):
+        # Check if this looks like a datapackage (has verb keys with lists)
+        verb_keys = ["update", "create", "delete", "rename"]
+        has_verb_keys = any(
+            key in obj and isinstance(obj[key], list) for key in verb_keys
+        )
+
+        if has_verb_keys:
+            # This is a datapackage - process each verb's list
+            for verb in verb_keys:
+                if verb in obj and isinstance(obj[verb], list):
+                    for migration_obj in obj[verb]:
+                        if isinstance(migration_obj, dict):
+                            tupleize_context(migration_obj)
+            return obj
+
+    # Handle individual migration object or dict with context
+    if isinstance(obj, dict):
+        # Process top-level context if present
+        if "context" in obj and not isinstance(obj["context"], str):
+            obj["context"] = as_normalized_tuple(obj["context"])
+
+        # Recursively process source and target
+        if isinstance(obj.get("source"), dict):
+            tupleize_context(obj["source"])
+        if isinstance(obj.get("target"), dict):
+            tupleize_context(obj["target"])
+
+    return obj
diff --git a/src/flowmapper/utils/files.py b/src/flowmapper/utils/files.py
new file mode 100644
index 0000000..d135101
--- /dev/null
+++ b/src/flowmapper/utils/files.py
@@ -0,0 +1,51 @@
+"""File I/O utility functions."""
+
+import importlib.resources as resource
+import json
+from pathlib import Path
+
+from flowmapper.utils.constants import RESULTS_DIR
+
+
+def load_standard_transformations() -> list:
+    """Load standard transformation files."""
+    with resource.as_file(
+        resource.files("flowmapper") / "data" / "simapro-2023-ecoinvent-3-contexts.json"
+    ) as filepath:
+        with open(filepath) as fs:
+            contexts = json.load(fs)
+    return [contexts]
+
+
+def read_migration_files(*filepaths: str | Path) -> list[dict]:
+    """
+    Read and aggregate migration data from multiple JSON files.
+
+    This function opens and reads a series of JSON files, each containing
+    migration data. File names are first resolved against `RESULTS_DIR`;
+    anything not found there is treated as a regular path.
+
+    Parameters
+    ----------
+    *filepaths : str | Path
+        Variable length argument list of file names or paths.
+
+    Returns
+    -------
+    list[dict]
+        A list with the parsed contents of each JSON file, in the order the
+        paths were given.
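+
+    Examples
+    --------
+    Hypothetical usage (the file name is illustrative):
+
+    >>> data = read_migration_files("manual-matches.json")
+    >>> isinstance(data, list)
+    True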
+ """ + migration_data = [] + + for filepath in filepaths: + if (RESULTS_DIR / filepath).is_file(): + filepath = RESULTS_DIR / filepath + with open(Path(filepath)) as fs: + migration_data.append(json.load(fs)) + + return migration_data diff --git a/src/flowmapper/utils/flow_names.py b/src/flowmapper/utils/flow_names.py new file mode 100644 index 0000000..6391410 --- /dev/null +++ b/src/flowmapper/utils/flow_names.py @@ -0,0 +1,31 @@ +"""Flow name processing utility functions.""" + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +import structlog + +if TYPE_CHECKING: + from flowmapper.domain.flow import Flow + +logger = structlog.get_logger("flowmapper") + +unit_slash = re.compile(r"/(?Pm3|kg)(\,?\s+|\s+|$)") + + +def remove_unit_slash(obj: Flow) -> str: + """Remove unit references from flow names that appear as '/unit' suffix.""" + name = obj.name.data + if match := unit_slash.search(name): + obj_dict = match.groupdict() + if match.end() == len(name): + name = name[: match.start()] + else: + name = name[: match.start()] + ", " + name[match.end() :] + if not obj.unit.compatible(obj_dict["unit"]): + logger.warning( + f"Flow {obj} has unit '{obj.unit}' but name refers to incompatible unit '{obj_dict['unit']}'" + ) + return name diff --git a/src/flowmapper/utils/randonneur.py b/src/flowmapper/utils/randonneur.py new file mode 100644 index 0000000..30ee56d --- /dev/null +++ b/src/flowmapper/utils/randonneur.py @@ -0,0 +1,148 @@ +"""Randonneur-based transformation utility functions.""" + +from __future__ import annotations + +import copy +from collections.abc import Callable +from functools import partial +from typing import TYPE_CHECKING + +from randonneur import Datapackage, MigrationConfig, migrate_nodes +from randonneur_data import Registry + +from flowmapper.utils.constants import default_registry +from flowmapper.utils.context import tupleize_context + +if TYPE_CHECKING: + from flowmapper.domain.flow import Flow + from flowmapper.domain.normalized_flow import NormalizedFlow + + +def randonneur_as_function( + datapackage: str | Datapackage | dict, + fields: list[str] | None = None, + registry: Registry | None = None, + verbs: list[str] | None = None, +) -> Callable: + """Take a prepared transformation in""" + if registry is None: + registry = default_registry + if verbs is None: + verbs = ["update"] + + if isinstance(datapackage, Datapackage): + datapackage = datapackage.data + elif isinstance(datapackage, str): + datapackage = registry.get_file(datapackage) + elif "update" not in datapackage: + raise KeyError + + return partial( + migrate_nodes, + migrations=tupleize_context(datapackage), + config=MigrationConfig( + verbs=verbs, + case_sensitive=( + False + if "case-insensitive" not in datapackage + else not datapackage.get("case-insensitive") + ), + fields=fields, + add_conversion_factor_to_nodes=True, + ), + ) + + +def apply_randonneur( + flows: list[NormalizedFlow], + datapackage: str | Datapackage | dict, + fields: list[str] | None = None, + registry: Registry | None = None, + normalize: bool = False, +) -> list[NormalizedFlow]: + """Apply randonneur transformations to NormalizedFlow objects.""" + from flowmapper.domain.flow import Flow + + func = randonneur_as_function( + datapackage=datapackage, fields=fields, registry=registry + ) + transformed_data = func(graph=[nf.normalized.to_dict() for nf in flows]) + + for flow, data_dict in zip(flows, transformed_data): + if normalize: + flow.current = Flow.from_dict(data_dict).normalize() + else: + 
+            flow.current = Flow.from_dict(data_dict)
+
+    return flows
+
+
+def apply_transformation_and_convert_flows_to_normalized_flows(
+    functions: list[Callable[..., list[dict]]], flows: list[Flow]
+) -> list[NormalizedFlow]:
+    """
+    Apply a series of transformation functions to flows and return NormalizedFlow objects.
+
+    This function takes a list of Flow objects and applies a sequence of transformation
+    functions to them. Each transformation function receives the flow data as dictionaries
+    (via the `graph` keyword argument) and returns modified dictionaries. The transformations
+    are applied sequentially, with each function receiving the output of the previous one.
+
+    After all transformations are applied, the modified flow dictionaries are converted back
+    to Flow objects, normalized, and wrapped in NormalizedFlow objects. The original Flow
+    objects are preserved and stored in the `original` attribute of each NormalizedFlow.
+
+    Parameters
+    ----------
+    functions : list[Callable[..., list[dict]]]
+        List of transformation functions to apply sequentially. Each function must accept
+        a `graph` keyword argument containing a list of flow dictionaries and return a
+        list of modified flow dictionaries. Functions are typically created using
+        `randonneur_as_function()`.
+    flows : list[Flow]
+        List of Flow objects to transform. The original Flow objects are not modified.
+
+    Returns
+    -------
+    list[NormalizedFlow]
+        List of NormalizedFlow objects, one for each input flow. Each NormalizedFlow contains:
+        - `original`: The original Flow object (unchanged)
+        - `normalized`: The transformed and normalized Flow object
+        - `current`: A copy of the normalized Flow object
+
+    Examples
+    --------
+    >>> from flowmapper.domain.flow import Flow
+    >>> from flowmapper.utils import apply_transformation_and_convert_flows_to_normalized_flows, randonneur_as_function
+    >>>
+    >>> # Create a transformation function
+    >>> transform_func = randonneur_as_function(datapackage="some-transformation")
+    >>>
+    >>> # Create flows
+    >>> flows = [
+    ...     Flow.from_dict({"name": "Carbon dioxide", "context": "air", "unit": "kg"})
+    ... ]
+    >>>
+    >>> # Apply transformations
+    >>> normalized_flows = apply_transformation_and_convert_flows_to_normalized_flows(
+    ...     functions=[transform_func],
+    ...     flows=flows
+    ... )
+    >>>
+    >>> # Access transformed data
+    >>> print(normalized_flows[0].normalized.name.data)
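+    >>> # Hypothetical check: the input Flow objects are preserved unchanged,
+    >>> # per the contract described above
+    >>> normalized_flows[0].original is flows[0]
+    True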
+    """
+    from flowmapper.domain.flow import Flow
+    from flowmapper.domain.normalized_flow import NormalizedFlow
+
+    flow_dicts = [obj.to_dict() for obj in flows]
+
+    for function in functions:
+        flow_dicts = function(graph=flow_dicts)
+
+    normalized_flows = [Flow.from_dict(obj).normalize() for obj in flow_dicts]
+
+    return [
+        NormalizedFlow(original=o, normalized=n, current=copy.copy(n))
+        for o, n in zip(flows, normalized_flows)
+    ]
diff --git a/src/flowmapper/utils/strings.py b/src/flowmapper/utils/strings.py
new file mode 100644
index 0000000..da25486
--- /dev/null
+++ b/src/flowmapper/utils/strings.py
@@ -0,0 +1,25 @@
+"""String manipulation utility functions."""
+
+import unicodedata
+from collections.abc import Collection, Mapping
+from typing import Any
+
+
+def normalize_str(s: Any) -> str:
+    """Normalize a string using Unicode NFC normalization and strip whitespace."""
+    if s is not None:
+        return unicodedata.normalize("NFC", s).strip()
+    else:
+        return ""
+
+
+def rowercase(obj: Any) -> Any:
+    """Recursively transform everything to lower case."""
+    if isinstance(obj, str):
+        return obj.lower()
+    elif isinstance(obj, Mapping):
+        return type(obj)([(rowercase(k), rowercase(v)) for k, v in obj.items()])
+    elif isinstance(obj, Collection):
+        return type(obj)([rowercase(o) for o in obj])
+    else:
+        return obj
diff --git a/tests/conftest.py b/tests/conftest.py
index 4e71bfc..59717d4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,16 +1 @@
 """Fixtures for flowmapper"""
-
-import pytest
-
-from flowmapper.flow import Flow
-from flowmapper.transformation_mapping import prepare_transformations
-from flowmapper.utils import (
-    apply_transformations,
-    load_standard_transformations,
-    read_migration_files,
-)
-
-
-@pytest.fixture
-def transformations():
-    return prepare_transformations(load_standard_transformations())
diff --git a/tests/data/ei-3.10.json b/tests/data/ei-3.10.json
index d9bc5c9..b62837e 100644
--- a/tests/data/ei-3.10.json
+++ b/tests/data/ei-3.10.json
@@ -1,9 +1,9 @@
 [
   {
     "identifier": "b6b4201e-0561-5992-912f-e729fbf04e41",
-    "CAS number": "002008-39-1",
+    "cas_number": "002008-39-1",
     "name": "2,4-D dimethylamine salt",
     "unit": "kg",
     "context": ["air", "non-urban air or from high stacks"]
   }
-]
\ No newline at end of file
+]
diff --git a/tests/data/ei-3.7.json b/tests/data/ei-3.7.json
index f338d95..dec80c2 100644
--- a/tests/data/ei-3.7.json
+++ b/tests/data/ei-3.7.json
@@ -1,7 +1,7 @@
 [
   {
     "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131",
-    "CAS number": "000110-63-4",
+    "cas_number": "000110-63-4",
     "name": "1,4-Butanediol",
     "unit": "kg",
     "context": ["air", "unspecified"],
@@ -9,9 +9,9 @@
   },
   {
     "identifier": "0f440cc0-0f74-446d-99d6-8ff0e97a2444",
-    "CAS number": "007664-41-7",
+    "cas_number": "007664-41-7",
     "name": "Ammonia",
     "unit": "kg",
     "context": ["air", "non-urban air or from high stacks"]
   }
-]
\ No newline at end of file
+]
diff --git a/tests/data/ei-3.9.json b/tests/data/ei-3.9.json
index d07c8a7..6d2c1e1 100644
--- a/tests/data/ei-3.9.json
+++ b/tests/data/ei-3.9.json
@@ -2,7 +2,7 @@
   {
     "identifier": "4f777e05-70f9-4a18-a406-d8232325073f",
     "formula": "C10H13Cl2NO3",
-    "CAS number": "002008-39-1",
+    "cas_number": "002008-39-1",
     "name": "2,4-D amines",
     "unit": "kg",
     "context": ["air", "non-urban air or from high stacks"],
@@ -12,4 +12,4 @@
     "N-methylmethanamine"
   ]
 }
-]
\ No newline at end of file
+]
diff --git a/tests/data/sp.json b/tests/data/sp.json
index d042aaa..4a9bef8 100644 --- a/tests/data/sp.json +++ b/tests/data/sp.json @@ -3,19 +3,19 @@ "name": "1,4-Butanediol", "context": "air", "unit": "kg", - "CAS number": "000110-63-4" + "cas_number": "000110-63-4" }, { "name": "1,4-Butanediol", "context": "air", "unit": "kg", - "CAS number": "000110-63-4" + "cas_number": "000110-63-4" }, { "name": "1,4-Butanediol", "context": "air/high. pop.", "unit": "kg", - "CAS number": "000110-63-4" + "cas_number": "000110-63-4" }, { "name": "Cesium-134", diff --git a/tests/test_cas.py b/tests/test_cas.py deleted file mode 100644 index 5b5a468..0000000 --- a/tests/test_cas.py +++ /dev/null @@ -1,61 +0,0 @@ -import pytest - -from flowmapper.cas import CASField - - -def test_cas_init(): - cas = CASField("0000096-49-1") - assert cas.original == "0000096-49-1" - assert cas.transformed == "96-49-1" - assert cas.digits == (9, 6, 4, 9, 1) - - -def test_cas_init_empty_string(): - cas = CASField("") - assert cas.original == "" - assert cas.transformed == "" - assert cas.digits == () - - -def test_cas_init_none(): - cas = CASField(None) - assert cas.original is None - assert cas.transformed == "" - assert cas.digits == () - - -def test_cas_init_error(): - with pytest.raises(TypeError): - CASField(96491) - - -def test_cas_export(): - assert CASField("7782-40-3").export == "7782-40-3" - assert CASField("7782403").export == "7782-40-3" - assert CASField("0007782403").export == "7782-40-3" - assert CASField("").export == "" - assert CASField(None).export == "" - - -def test_invalid_cas_check_digit(): - assert not CASField("96-49-2").valid - assert CASField("96-49-2").check_digit_expected == 1 - - -def test_cas_repr(): - repr(CASField("0000096-49-1")) == "Valid CASField: '0000096-49-1' -> '96-49-1'" - repr(CASField("0000096-49-2")) == "Invalid CASField: '0000096-49-2' -> '96-49-2'" - repr(CASField("")) == "CASField with missing original value" - - -def test_equality_comparison(): - assert CASField("\t\n\n007440-05-3") == CASField("7440-05-3") - assert CASField("7440-05-3") == "0007440-05-3" - assert CASField("7440-05-3") == "7440-05-3" - assert not CASField("7440-05-3") == "7782-40-3" - assert not CASField("7440-05-3") == CASField("7782-40-3") - assert not CASField("") == CASField("7782-40-3") - assert not CASField("7440-05-3") == CASField("") - assert not CASField("") == CASField("") - assert not CASField(None) == CASField("") - assert not CASField("") == CASField(None) diff --git a/tests/test_cli.py b/tests/test_cli.py deleted file mode 100644 index 305b05c..0000000 --- a/tests/test_cli.py +++ /dev/null @@ -1,175 +0,0 @@ -import json - -import pandas as pd -import pytest -from typer.testing import CliRunner - -from flowmapper.cli import app - -runner = CliRunner() - - -def test_version(): - result = runner.invoke(app, ["--version"]) - assert result.output.startswith("flowmapper, version") - - -def test_format_glad(tmp_path): - result = runner.invoke( - app, - [ - "map", - "tests/data/sp.json", - "tests/data/ei-3.7.json", - "--format", - "glad", - "--output-dir", - str(tmp_path), - ], - ) - expected_files = sorted( - [ - tmp_path / "sp-ei-3.7.xlsx", - tmp_path / "sp-ei-3.7-unmatched-source.json", - tmp_path / "sp-ei-3.7-unmatched-target.json", - ] - ) - - files = sorted(tmp_path.glob("**/*")) - - assert result.exit_code == 0 - assert expected_files == files - - -def test_format_randonneur(tmp_path): - result = runner.invoke( - app, - [ - "map", - "tests/data/sp.json", - "tests/data/ei-3.7.json", - "--format", - "randonneur", - "--output-dir", - str(tmp_path), - ], 
- ) - expected_files = sorted( - [ - tmp_path / "sp-ei-3.7.json", - tmp_path / "sp-ei-3.7-unmatched-source.json", - tmp_path / "sp-ei-3.7-unmatched-target.json", - ] - ) - - files = sorted(tmp_path.glob("**/*")) - - assert result.exit_code == 0 - assert expected_files == files - - -def test_matched_flows(tmp_path): - runner.invoke( - app, - [ - "map", - "tests/data/sp.json", - "tests/data/ei-3.7.json", - "--matched-source", - "--matched-target", - "--output-dir", - str(tmp_path), - ], - ) - - with open(tmp_path / "sp-ei-3.7-matched-source.json") as fs: - actual = json.load(fs) - - expected = [ - { - "CAS number": "110-63-4", - "context": "air", - "name": "1,4-Butanediol", - "unit": "kg", - }, - {"context": "air/low. pop.", "name": "Ammonia, FR", "unit": "kg"}, - ] - assert actual == expected - - -def test_matched_flows_with_randonneur_transformations(tmp_path): - runner.invoke( - app, - [ - "map", - "tests/data/sp.json", - "tests/data/ei-3.7.json", - "--transformations", - "tests/data/transformations.json", - "--matched-source", - "--matched-target", - "--output-dir", - str(tmp_path), - ], - ) - - with open(tmp_path / "sp-ei-3.7-matched-source.json") as fs: - actual = json.load(fs) - - expected = [ - { - "CAS number": "110-63-4", - "context": "air", - "name": "1,4-Butanediol", - "unit": "kg", - }, - { - "CAS number": "110-63-4", - "context": "air/high. pop.", - "name": "1,4-Butanediol", - "unit": "kg", - }, - {"context": "air/low. pop.", "name": "Ammonia, FR", "unit": "kg"}, - {"context": "air/low. pop.", "name": "Ammonia, as N", "unit": "kg"}, - ] - assert actual == expected - - -def test_matched_flows_with_multiple_randonneur_transformations(tmp_path): - runner.invoke( - app, - [ - "map", - "tests/data/sp.json", - "tests/data/ei-3.7.json", - "--transformations", - "tests/data/transformations.json", - "--transformations", - "tests/data/migrations.json", - "--matched-source", - "--matched-target", - "--output-dir", - str(tmp_path), - ], - ) - - with open(tmp_path / "sp-ei-3.7-matched-source.json") as fs: - actual = json.load(fs) - - expected = [ - { - "name": "1,4-Butanediol", - "unit": "kg", - "context": "air", - "CAS number": "110-63-4", - }, - { - "name": "1,4-Butanediol", - "unit": "kg", - "context": "air/high. pop.", - "CAS number": "110-63-4", - }, - {"name": "Ammonia, FR", "unit": "kg", "context": "air/low. pop."}, - {"name": "Ammonia, as N", "unit": "kg", "context": "air/low. 
pop."}, - ] - assert actual == expected diff --git a/tests/test_context.py b/tests/test_context.py deleted file mode 100644 index e7eb83a..0000000 --- a/tests/test_context.py +++ /dev/null @@ -1,124 +0,0 @@ -import pytest - -from flowmapper.context import MISSING_VALUES, ContextField - - -def test_context_uses_transformed(): - c = ContextField( - original="Raw/(unspecified)", - transformed=["Raw", "(unspecified)"], - ) - assert c == ["Raw", "(unspecified)"] - assert c.transformed == ["Raw", "(unspecified)"] - - -def test_context_transformed_from_tuple(): - c = ContextField( - original="Raw/(unspecified)", - transformed=("Raw", "(unspecified)"), - ) - assert c == ["Raw", "(unspecified)"] - assert c.transformed == ("Raw", "(unspecified)") - - -def test_context_transformed_from_string_with_slash(): - c = ContextField( - original="Raw/(unspecified)", - transformed="Raw/(unspecified)", - ) - assert c == ["Raw", "(unspecified)"] - assert c.transformed == "Raw/(unspecified)" - - -def test_context_transformed_from_string(): - c = ContextField( - original="Raw/(unspecified)", - transformed="Raw", - ) - assert c == ["Raw", "(unspecified)"] - assert c.transformed == "Raw" - - -def test_context_transformed_not_given(): - c = ContextField( - original="Raw/(unspecified)", - ) - assert c == ["Raw", "(unspecified)"] - assert c.transformed == "Raw/(unspecified)" - - -def test_context_normalize_tuple(): - c = ContextField( - original=("Raw",), - ) - assert c.normalized == ("raw",) - - -def test_context_normalize_string_with_slash(): - c = ContextField( - original="A/B", - ) - assert c.normalized == ("a", "b") - - -def test_context_normalize_string(): - c = ContextField( - original="A-B", - ) - assert c.normalized == ("a-b",) - - -def test_context_normalize_error(): - class Foo: - pass - - with pytest.raises(ValueError): - ContextField(Foo()) - - -def test_context_normalize_lowercase(): - c = ContextField( - original="A-B", - ) - assert c.normalized == ("a-b",) - - -def test_context_normalize_strip(): - c = ContextField( - original=" A-B\t\n", - ) - assert c.normalized == ("a-b",) - - -@pytest.mark.parametrize("string", MISSING_VALUES) -def test_context_missing_values(string): - c = ContextField( - original=("A", string), - ) - assert c.original == ("A", string) - assert c.normalized == ("a",) - - -def test_context_generic_dunder(): - c = ContextField("A/B") - assert repr(c) == "ContextField: 'A/B' -> '('a', 'b')'" - assert repr(ContextField("")) == "ContextField: '' -> '()'" - assert bool(c) - assert isinstance(hash(c), int) - assert list(c) == ["a", "b"] - - -def test_context_in(): - a = ContextField("A") - b = ContextField("A/B") - assert b in a - assert a not in b - - -def test_context_export_as_string(): - assert ContextField(["A", "B"]).export_as_string() == "A✂️B" - assert ContextField("A/B").export_as_string() == "A/B" - c = ContextField("A/B") - c.original = {"A": "B"} - with pytest.raises(ValueError): - c.export_as_string() diff --git a/tests/test_extract_ecospold2.py b/tests/test_extract_ecospold2.py index d3d7851..e88bc4a 100644 --- a/tests/test_extract_ecospold2.py +++ b/tests/test_extract_ecospold2.py @@ -10,8 +10,14 @@ def test_remove_conflicting_synonyms_no_conflicts(): result = remove_conflicting_synonyms(data) - assert result[0]["synonyms"] == ["water", "h2o"] - assert result[1]["synonyms"] == ["soil", "earth"] + assert result[0]["synonyms"] == [ + "water", + "h2o", + ], f"Expected result[0]['synonyms'] to equal ['water', 'h2o'], but got {result[0]['synonyms']}" + assert 
result[1]["synonyms"] == [ + "soil", + "earth", + ], f"Expected result[1]['synonyms'] to equal ['soil', 'earth'], but got {result[1]['synonyms']}" def test_remove_conflicting_synonyms_with_conflicts(): @@ -28,8 +34,12 @@ def test_remove_conflicting_synonyms_with_conflicts(): result = remove_conflicting_synonyms(data) # "water" should be removed from flow_a's synonyms - assert result[0]["synonyms"] == ["h2o"] - assert result[1]["synonyms"] == ["aqua"] + assert result[0]["synonyms"] == [ + "h2o" + ], f"Expected result[0]['synonyms'] to equal ['h2o'], but got {result[0]['synonyms']}" + assert result[1]["synonyms"] == [ + "aqua" + ], f"Expected result[1]['synonyms'] to equal ['aqua'], but got {result[1]['synonyms']}" def test_remove_conflicting_synonyms_different_contexts(): @@ -46,8 +56,13 @@ def test_remove_conflicting_synonyms_different_contexts(): result = remove_conflicting_synonyms(data) # "water" should be kept since contexts are different - assert result[0]["synonyms"] == ["water", "h2o"] - assert result[1]["synonyms"] == ["aqua"] + assert result[0]["synonyms"] == [ + "water", + "h2o", + ], f"Expected result[0]['synonyms'] to equal ['water', 'h2o'], but got {result[0]['synonyms']}" + assert result[1]["synonyms"] == [ + "aqua" + ], f"Expected result[1]['synonyms'] to equal ['aqua'], but got {result[1]['synonyms']}" def test_remove_conflicting_synonyms_multiple_conflicts(): @@ -61,9 +76,15 @@ def test_remove_conflicting_synonyms_multiple_conflicts(): result = remove_conflicting_synonyms(data) # Both "water" and "soil" should be removed from flow_a's synonyms - assert result[0]["synonyms"] == ["h2o"] - assert result[1]["synonyms"] == ["aqua"] - assert result[2]["synonyms"] == ["earth"] + assert result[0]["synonyms"] == [ + "h2o" + ], f"Expected result[0]['synonyms'] to equal ['h2o'], but got {result[0]['synonyms']}" + assert result[1]["synonyms"] == [ + "aqua" + ], f"Expected result[1]['synonyms'] to equal ['aqua'], but got {result[1]['synonyms']}" + assert result[2]["synonyms"] == [ + "earth" + ], f"Expected result[2]['synonyms'] to equal ['earth'], but got {result[2]['synonyms']}" def test_remove_conflicting_synonyms_no_synonyms(): @@ -80,8 +101,12 @@ def test_remove_conflicting_synonyms_no_synonyms(): result = remove_conflicting_synonyms(data) # Should not raise error and flow_b should keep its synonym - assert "synonyms" not in result[0] - assert result[1]["synonyms"] == ["water"] + assert ( + "synonyms" not in result[0] + ), "Expected 'synonyms' to not be in result[0], but it was" + assert result[1]["synonyms"] == [ + "water" + ], f"Expected result[1]['synonyms'] to equal ['water'], but got {result[1]['synonyms']}" def test_remove_conflicting_synonyms_no_context(): @@ -98,8 +123,13 @@ def test_remove_conflicting_synonyms_no_context(): result = remove_conflicting_synonyms(data) # flow_a should keep its synonyms since it has no context - assert result[0]["synonyms"] == ["water", "h2o"] - assert result[1]["synonyms"] == ["aqua"] + assert result[0]["synonyms"] == [ + "water", + "h2o", + ], f"Expected result[0]['synonyms'] to equal ['water', 'h2o'], but got {result[0]['synonyms']}" + assert result[1]["synonyms"] == [ + "aqua" + ], f"Expected result[1]['synonyms'] to equal ['aqua'], but got {result[1]['synonyms']}" def test_remove_conflicting_synonyms_empty_synonyms_list(): @@ -112,8 +142,12 @@ def test_remove_conflicting_synonyms_empty_synonyms_list(): result = remove_conflicting_synonyms(data) # Empty synonyms list should remain empty - assert result[0]["synonyms"] == [] - assert 
result[1]["synonyms"] == ["aqua"] + assert ( + result[0]["synonyms"] == [] + ), f"Expected result[0]['synonyms'] to equal [], but got {result[0]['synonyms']}" + assert result[1]["synonyms"] == [ + "aqua" + ], f"Expected result[1]['synonyms'] to equal ['aqua'], but got {result[1]['synonyms']}" def test_remove_conflicting_synonyms_case_insensitive(): @@ -129,8 +163,12 @@ def test_remove_conflicting_synonyms_case_insensitive(): result = remove_conflicting_synonyms(data) - assert result[0]["synonyms"] == ["H2O"] - assert result[1]["synonyms"] == ["aqua"] + assert result[0]["synonyms"] == [ + "H2O" + ], f"Expected result[0]['synonyms'] to equal ['H2O'], but got {result[0]['synonyms']}" + assert result[1]["synonyms"] == [ + "aqua" + ], f"Expected result[1]['synonyms'] to equal ['aqua'], but got {result[1]['synonyms']}" def test_remove_conflicting_synonyms_self_conflict(): @@ -142,7 +180,10 @@ def test_remove_conflicting_synonyms_self_conflict(): result = remove_conflicting_synonyms(data) # All synonyms should be kept since they don't conflict with other flows - assert result[0]["synonyms"] == ["h2o", "aqua"] + assert result[0]["synonyms"] == [ + "h2o", + "aqua", + ], f"Expected result[0]['synonyms'] to equal ['h2o', 'aqua'], but got {result[0]['synonyms']}" def test_remove_conflicting_synonyms_preserves_original_data(): @@ -167,14 +208,34 @@ def test_remove_conflicting_synonyms_preserves_original_data(): result = remove_conflicting_synonyms(data) # Check that other fields are preserved - assert result[0]["name"] == "flow_a" - assert result[0]["context"] == ["ground"] - assert result[0]["unit"] == "kg" - assert result[0]["identifier"] == "123" - assert result[0]["synonyms"] == ["h2o"] # Only "water" removed - - assert result[1]["name"] == "water" - assert result[1]["context"] == ["ground"] - assert result[1]["unit"] == "m3" - assert result[1]["identifier"] == "456" - assert result[1]["synonyms"] == ["aqua"] + assert ( + result[0]["name"] == "flow_a" + ), f"Expected result[0]['name'] to equal 'flow_a', but got {result[0]['name']!r}" + assert result[0]["context"] == [ + "ground" + ], f"Expected result[0]['context'] to equal ['ground'], but got {result[0]['context']}" + assert ( + result[0]["unit"] == "kg" + ), f"Expected result[0]['unit'] to equal 'kg', but got {result[0]['unit']!r}" + assert ( + result[0]["identifier"] == "123" + ), f"Expected result[0]['identifier'] to equal '123', but got {result[0]['identifier']!r}" + assert result[0]["synonyms"] == [ + "h2o" + ], f"Expected result[0]['synonyms'] to equal ['h2o'], but got {result[0]['synonyms']}" # Only "water" removed + + assert ( + result[1]["name"] == "water" + ), f"Expected result[1]['name'] to equal 'water', but got {result[1]['name']!r}" + assert result[1]["context"] == [ + "ground" + ], f"Expected result[1]['context'] to equal ['ground'], but got {result[1]['context']}" + assert ( + result[1]["unit"] == "m3" + ), f"Expected result[1]['unit'] to equal 'm3', but got {result[1]['unit']!r}" + assert ( + result[1]["identifier"] == "456" + ), f"Expected result[1]['identifier'] to equal '456', but got {result[1]['identifier']!r}" + assert result[1]["synonyms"] == [ + "aqua" + ], f"Expected result[1]['synonyms'] to equal ['aqua'], but got {result[1]['synonyms']}" diff --git a/tests/test_flow.py b/tests/test_flow.py deleted file mode 100644 index 4e42c96..0000000 --- a/tests/test_flow.py +++ /dev/null @@ -1,135 +0,0 @@ -from flowmapper.cas import CASField -from flowmapper.flow import Flow -from flowmapper.transformation_mapping import 
prepare_transformations - - -def test_flow_with_transformations_repr(): - d = { - "name": "Carbon dioxide, in air", - "context": ["Raw", "(unspecified)"], - "unit": "kg", - "cas": "000124-38-9", - } - - transformations = prepare_transformations( - [ - { - "update": [ - { - "source": { - "name": "Carbon dioxide, in air", - "context": ["Raw", "(unspecified)"], - }, - "target": {"name": "Carbon dioxide"}, - } - ] - } - ] - ) - - f = Flow(d, transformations=transformations) - expected = """Flow object: - Identifier: StringField with missing original value - Name: StringField: 'Carbon dioxide, in air' -> 'carbon dioxide' - Context: ContextField: '['Raw', '(unspecified)']' -> '('raw',)' - Unit: UnitField: 'kg' -> 'kg'""" - - assert repr(f) == expected - - -def test_flow_from_sp_categories(transformations): - data = { - "name": "Carbon dioxide, in air", - "context": "resources/in air", - "unit": "kg", - "CAS number": "000124-38-9", - } - - flow = Flow(data, transformations) - assert not flow.identifier - assert flow.name.original == "Carbon dioxide, in air" - assert flow.name.normalized == "carbon dioxide, in air" - assert flow.context.original == "resources/in air" - assert flow.context.normalized == ("natural resource", "in air") - - -def test_flow_from_sp_missing(transformations): - data = {"name": "Chrysotile", "context": "Raw/in ground", "unit": "kg"} - - flow = Flow(data, transformations) - assert flow.name.original == "Chrysotile" - expected = """Flow object: - Identifier: StringField with missing original value - Name: StringField: 'Chrysotile' -> 'chrysotile' - Context: ContextField: 'Raw/in ground' -> '('natural resource', 'in ground')' - Unit: UnitField: 'kg' -> 'kilogram'""" - assert repr(flow) == expected - assert flow.context.original == "Raw/in ground" - assert flow.context.normalized == ("natural resource", "in ground") - - -def test_flow_cas(): - data = { - "name": "Actinium", - "CAS number": "007440-34-8", - "chemical formula": "Ac\u007f", - "synonyms": "Actinium", - "unit": "kg", - "Class": "Raw materials", - "context": "Raw materials", - "Description": "", - } - - fields = { - "identifier": "Flow UUID", - "name": "name", - "context": "context", - "unit": "unit", - "CAS number": "CAS No", - } - - flow = Flow(data) - assert flow.cas == CASField("007440-34-8") - assert flow.cas == "7440-34-8" - - -def test_flow_from_ei(): - data = { - "name": "1,3-Dioxolan-2-one", - "CAS number": "000096-49-1", - "chemical formula": "", - "synonyms": "", - "unit": "kg", - "Class": "chemical", - "ExternalReference": "", - "Preferred": "", - "context": "water/unspecified", - "identifier": "5b7d620e-2238-5ec9-888a-6999218b6974", - "AltUnit": "", - "Var": "", - "Second CAS": "96-49-1", - } - flow = Flow(data) - assert flow.identifier == "5b7d620e-2238-5ec9-888a-6999218b6974" - - -def test_flow_with_synonyms(transformations): - data = { - "identifier": "f0cc0453-32c0-48f5-b8d4-fc87d100b8d9", - "CAS number": "000078-79-5", - "name": "Isoprene", - "unit": "kg", - "context": ["air", "low population density, long-term"], - "synonyms": [ - "2-methylbuta-1,3-diene", - "methyl bivinyl", - "hemiterpene", - ], - } - - flow = Flow(data, transformations) - assert [obj.original for obj in flow.synonyms] == [ - "2-methylbuta-1,3-diene", - "methyl bivinyl", - "hemiterpene", - ] diff --git a/tests/test_flowmap.py b/tests/test_flowmap.py deleted file mode 100644 index da3da92..0000000 --- a/tests/test_flowmap.py +++ /dev/null @@ -1,322 +0,0 @@ -import json -from pathlib import Path - -import pandas as pd -import 
pytest - -from flowmapper import Flow, Flowmap -from flowmapper.match import match_emissions_with_suffix_ion, match_identical_names - -DATA_DIR = Path(__file__).parent / "data" - - -@pytest.fixture -def source_flows(transformations): - return [ - Flow(flow, transformations) for flow in json.load(open(DATA_DIR / "sp.json")) - ] - - -@pytest.fixture -def target_flows(transformations): - return [ - Flow(flow, transformations) - for flow in json.load(open(DATA_DIR / "ei-3.7.json")) - ] - - -@pytest.fixture -def ei39(): - return [Flow(flow) for flow in json.load(open(DATA_DIR / "ei-3.9.json"))] - - -@pytest.fixture -def ei310(): - return [Flow(flow) for flow in json.load(open(DATA_DIR / "ei-3.10.json"))] - - -def test_flowmap_remove_duplicates(source_flows, target_flows): - flowmap = Flowmap(source_flows, target_flows) - actual = flowmap.source_flows - # Added one duplicate on purpose - assert len(flowmap.source_flows) == 7 - - -def test_flowmap_mappings(source_flows, target_flows): - flowmap = Flowmap(source_flows, target_flows) - actual = flowmap.mappings[0] - assert list(actual.keys()) == [ - "from", - "to", - "conversion_factor", - "match_rule", - "match_rule_priority", - "info", - ] - assert actual["match_rule"] == "match_identical_names" - - -def test_flowmap_to_randonneur(source_flows, target_flows): - flowmap = Flowmap(source_flows, target_flows) - actual = flowmap.to_randonneur() - expected = [ - { - "comment": "Identical names", - "conversion_factor": 1.0, - "source": { - "CAS number": "110-63-4", - "context": "air", - "name": "1,4-Butanediol", - "unit": "kg", - }, - "target": { - "CAS number": "110-63-4", - "context": ["air", "unspecified"], - "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "name": "1,4-Butanediol", - "unit": "kg", - }, - }, - { - "comment": "Name matching with location code", - "conversion_factor": 1.0, - "source": {"context": "air/low. pop.", "name": "Ammonia, FR", "unit": "kg"}, - "target": { - "CAS number": "7664-41-7", - "context": ["air", "non-urban air or from high stacks"], - "identifier": "0f440cc0-0f74-446d-99d6-8ff0e97a2444", - "location": "FR", - "name": "Ammonia", - "unit": "kg", - }, - }, - ] - assert actual == expected - - -def test_flowmap_to_randonneur_export(source_flows, target_flows, tmp_path): - flowmap = Flowmap(source_flows, target_flows) - flowmap.to_randonneur(tmp_path / "randonneur.json") - with open(tmp_path / "randonneur.json", "r") as fs: - actual = json.load(fs) - expected = [ - { - "comment": "Identical names", - "conversion_factor": 1.0, - "source": { - "CAS number": "110-63-4", - "context": "air", - "name": "1,4-Butanediol", - "unit": "kg", - }, - "target": { - "CAS number": "110-63-4", - "context": ["air", "unspecified"], - "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "name": "1,4-Butanediol", - "unit": "kg", - }, - }, - { - "comment": "Name matching with location code", - "conversion_factor": 1.0, - "source": {"context": "air/low. 
pop.", "name": "Ammonia, FR", "unit": "kg"}, - "target": { - "CAS number": "7664-41-7", - "context": ["air", "non-urban air or from high stacks"], - "identifier": "0f440cc0-0f74-446d-99d6-8ff0e97a2444", - "location": "FR", - "name": "Ammonia", - "unit": "kg", - }, - }, - ] - assert actual == expected - - -def test_flowmap_with_custom_rules_no_match(source_flows, target_flows): - flowmap = Flowmap( - source_flows, - target_flows, - rules=[match_emissions_with_suffix_ion], - ) - actual = flowmap.mappings - assert actual == [] - - -def test_flowmap_with_custom_rules_match(source_flows, target_flows): - flowmap = Flowmap(source_flows, target_flows, rules=[match_identical_names]) - actual = flowmap.to_randonneur() - expected = [ - { - "comment": "Identical names", - "conversion_factor": 1.0, - "source": { - "CAS number": "110-63-4", - "context": "air", - "name": "1,4-Butanediol", - "unit": "kg", - }, - "target": { - "CAS number": "110-63-4", - "context": [ - "air", - "unspecified", - ], - "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "name": "1,4-Butanediol", - "unit": "kg", - }, - } - ] - assert actual == expected - - -def test_flowmap_to_glad(source_flows, target_flows): - flowmap = Flowmap(source_flows, target_flows) - actual = flowmap.to_glad() - expected = { - "SourceFlowName": ["1,4-Butanediol", "Ammonia, FR"], - "SourceFlowUUID": ["", ""], - "SourceFlowContext": ["air", "air/low. pop."], - "SourceUnit": ["kg", "kg"], - "MatchCondition": ["=", "="], - "ConversionFactor": [1.0, 1.0], - "TargetFlowName": ["1,4-Butanediol", "Ammonia"], - "TargetFlowUUID": [ - "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "0f440cc0-0f74-446d-99d6-8ff0e97a2444", - ], - "TargetFlowContext": [ - "air✂️unspecified", - "air✂️non-urban air or from high stacks", - ], - "TargetUnit": ["kg", "kg"], - "MemoMapper": ["Identical names", "Name matching with location code"], - } - pd.testing.assert_frame_equal(actual, pd.DataFrame(expected)) - - -def test_flowmap_to_glad_export(source_flows, target_flows, tmp_path): - flowmap = Flowmap(source_flows, target_flows) - flowmap.to_glad(tmp_path / "glad.xlsx") - actual = pd.read_excel(tmp_path / "glad.xlsx") - print(actual["MatchCondition"]) - expected = { - "SourceFlowName": ["1,4-Butanediol", "Ammonia, FR"], - "SourceFlowUUID": [float("NaN"), float("NaN")], - "SourceFlowContext": ["air", "air/low. 
pop."], - "SourceUnit": ["kg", "kg"], - "MatchCondition": ["=", "="], - "ConversionFactor": [1, 1], - "TargetFlowName": ["1,4-Butanediol", "Ammonia"], - "TargetFlowUUID": [ - "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "0f440cc0-0f74-446d-99d6-8ff0e97a2444", - ], - "TargetFlowContext": [ - "air✂️unspecified", - "air✂️non-urban air or from high stacks", - ], - "TargetUnit": ["kg", "kg"], - "MemoMapper": ["Identical names", "Name matching with location code"], - } - pd.testing.assert_frame_equal(actual, pd.DataFrame(expected)) - - -def test_flowmap_nomatch_rule(source_flows, target_flows): - nomatch = lambda flow: flow.context == "air/urban air close to ground" - flowmap = Flowmap(source_flows, target_flows, nomatch_rules=[nomatch]) - - assert len(flowmap.source_flows_nomatch) == 1 - assert flowmap.source_flows_nomatch[0].name == "1,4-Butanediol" - assert flowmap.source_flows_nomatch[0].context == "air/urban air close to ground" - assert flowmap.source_flows[0].name == "1,4-Butanediol" - assert flowmap.source_flows[0].context == "air" - - -def test_flowmap_nomatch_rule_false(source_flows, target_flows): - nomatch = lambda flow: flow.context == "water" - flowmap = Flowmap(source_flows, target_flows, nomatch_rules=[nomatch]) - assert not flowmap.source_flows_nomatch - - -def test_flowmap_nomatch_multiple_rules(source_flows, target_flows): - nomatch1 = lambda flow: flow.context == "air/urban air close to ground" - nomatch2 = lambda flow: flow.context == "air" - flowmap = Flowmap(source_flows, target_flows, nomatch_rules=[nomatch1, nomatch2]) - - assert len(flowmap.source_flows_nomatch) == 2 - assert flowmap.source_flows_nomatch[0].name == "1,4-Butanediol" - assert flowmap.source_flows_nomatch[1].name == "1,4-Butanediol" - assert flowmap.source_flows[0].name == "Cesium-134" - - -def test_flowmap_mappings_ei_ei(target_flows): - flowmap = Flowmap(target_flows, target_flows) - actual = flowmap.to_randonneur() - expected = [ - { - "source": { - "name": "1,4-Butanediol", - "unit": "kg", - "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "context": ["air", "unspecified"], - "CAS number": "110-63-4", - }, - "target": { - "name": "1,4-Butanediol", - "unit": "kg", - "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "context": ["air", "unspecified"], - "CAS number": "110-63-4", - }, - "conversion_factor": 1.0, - "comment": "Identical identifier", - }, - { - "source": { - "name": "Ammonia", - "unit": "kg", - "identifier": "0f440cc0-0f74-446d-99d6-8ff0e97a2444", - "context": ["air", "non-urban air or from high stacks"], - "CAS number": "7664-41-7", - }, - "target": { - "name": "Ammonia", - "unit": "kg", - "identifier": "0f440cc0-0f74-446d-99d6-8ff0e97a2444", - "context": ["air", "non-urban air or from high stacks"], - "CAS number": "7664-41-7", - }, - "conversion_factor": 1.0, - "comment": "Identical identifier", - }, - ] - assert actual == expected - - -def test_flowmap_mappings_ei39_ei310(ei39, ei310): - flowmap = Flowmap(ei39, ei310) - actual = flowmap.to_randonneur() - expected = [ - { - "source": { - "name": "2,4-D amines", - "unit": "kg", - "identifier": "4f777e05-70f9-4a18-a406-d8232325073f", - "context": ["air", "non-urban air or from high stacks"], - "CAS number": "2008-39-1", - }, - "target": { - "name": "2,4-D dimethylamine salt", - "unit": "kg", - "identifier": "b6b4201e-0561-5992-912f-e729fbf04e41", - "context": ["air", "non-urban air or from high stacks"], - "CAS number": "2008-39-1", - }, - "conversion_factor": 1.0, - "comment": "Identical CAS numbers", - } - ] - assert actual == 
expected diff --git a/tests/test_format_match_result.py b/tests/test_format_match_result.py deleted file mode 100644 index 535943f..0000000 --- a/tests/test_format_match_result.py +++ /dev/null @@ -1,40 +0,0 @@ -from deepdiff import DeepDiff - -from flowmapper.flow import Flow -from flowmapper.match import format_match_result - - -def test_format_match_result_missing_id(transformations): - source = { - "name": "Carbon dioxide, in air", - "context": "Raw materials", - "unit": "kg", - } - s = Flow(source, transformations) - - target = { - "identifier": "cc6a1abb-b123-4ca6-8f16-38209df609be", - "name": "Carbon dioxide, in air", - "context": "natural resource/in air", - "unit": "kg", - } - t = Flow(target) - - actual = format_match_result(s, t, 1.0, {"is_match": True, "comment": "foo"}) - expected = { - "source": { - "name": "Carbon dioxide, in air", - "context": "Raw materials", - "unit": "kg", - }, - "target": { - "identifier": "cc6a1abb-b123-4ca6-8f16-38209df609be", - "name": "Carbon dioxide, in air", - "context": "natural resource/in air", - "unit": "kg", - }, - "conversion_factor": 1.0, - "comment": "foo", - } - - assert not DeepDiff(actual, expected) diff --git a/tests/test_get_conversion_factor.py b/tests/test_get_conversion_factor.py deleted file mode 100644 index 862f917..0000000 --- a/tests/test_get_conversion_factor.py +++ /dev/null @@ -1,124 +0,0 @@ -import math - -from flowmapper.flow import Flow - - -def test_get_conversion_factor(transformations): - s = Flow( - { - "name": "Protactinium-234", - "unit": "Bq", - "context": ["Emissions to air", "low. pop."], - }, - transformations, - ) - - t = Flow( - { - "identifier": "fb13070e-06f1-4964-832f-a23945b880cc", - "name": "Protactinium-234", - "unit": "kBq", - "context": ["air", "non-urban air or from high stacks"], - }, - transformations, - ) - - actual = s.unit.conversion_factor(t.unit) - expected = 1e-3 - assert actual == expected - - -def test_get_conversion_factor_water(transformations): - s = Flow( - {"name": "Water", "unit": "kg", "context": ["Emissions to water", ""]}, - transformations, - ) - - t = Flow( - { - "identifier": "2404b41a-2eed-4e9d-8ab6-783946fdf5d6", - "CAS number": "007732-18-5", - "name": "Water", - "unit": "m3", - "context": ["water", "unspecified"], - }, - transformations, - ) - - actual = s.unit.conversion_factor(t.unit) - assert math.isnan(actual) - - -def test_get_conversion_factor_m3y(transformations): - s = Flow( - { - "name": "Volume occupied, reservoir", - "unit": "m3y", - "context": ["Resources", "in water"], - }, - transformations, - ) - - t = Flow( - { - "identifier": "9a9d71c7-79f7-42d0-af47-282d22a7cf07", - "name": "Volume occupied, reservoir", - "unit": "m3*year", - "context": ["natural resource", "in water"], - }, - transformations, - ) - - actual = s.unit.conversion_factor(t.unit) - expected = 1 - assert actual == expected - - -def test_get_conversion_factor_m2a(transformations): - s = Flow( - { - "name": "Occupation, annual crop", - "unit": "m2a", - "context": ["Resources", "land"], - }, - transformations, - ) - - t = Flow( - { - "identifier": "c5aafa60-495c-461c-a1d4-b262a34c45b9", - "name": "Occupation, annual crop", - "unit": "m2*year", - "context": ["natural resource", "land"], - }, - transformations, - ) - - actual = s.unit.conversion_factor(t.unit) - expected = 1 - assert actual == expected - - -def test_get_conversion_factor_nan(transformations): - s = Flow( - { - "name": "Radium-226/kg", - "unit": "kg", - "context": ["Emissions to water", ""], - }, - transformations, - ) - - t = Flow( - 
{ - "identifier": "74a0aabb-e11b-4f3b-8921-45e447b33393", - "CAS number": "013982-63-3", - "name": "Radium-226", - "unit": "kBq", - "context": ["water", "ocean"], - }, - transformations, - ) - - actual = s.unit.conversion_factor(t.unit) - assert math.isnan(actual) diff --git a/tests/test_id_generation.py b/tests/test_id_generation.py deleted file mode 100644 index 031aac7..0000000 --- a/tests/test_id_generation.py +++ /dev/null @@ -1,11 +0,0 @@ -from flowmapper.utils import generate_flow_id - - -def test_generate_flow_id(): - flow1 = { - "name": "1,4-Butanediol", - "context": ["Air", "(unspecified)"], - "unit": "kg", - "CAS number": "000110-63-4", - } - assert generate_flow_id(flow1) == "77bb0c932afd7d7eb7ada382c8828b9f" diff --git a/tests/test_match_biogenic_to_non_fossil.py b/tests/test_match_biogenic_to_non_fossil.py deleted file mode 100644 index f0573c4..0000000 --- a/tests/test_match_biogenic_to_non_fossil.py +++ /dev/null @@ -1,11 +0,0 @@ -from flowmapper.flow import Flow -from flowmapper.match import match_biogenic_to_non_fossil - - -def test_match_biogenic_to_non_fossil(): - s = Flow({"name": "Oils, biogenic", "context": "air", "unit": "kg"}) - t = Flow({"name": "Oils, non-fossil", "context": "air", "unit": "kg"}) - - actual = match_biogenic_to_non_fossil(s, t) - expected = {"comment": "Biogenic to non-fossil if no better match"} - assert actual == expected diff --git a/tests/test_match_custom_names_with_location_codes.py b/tests/test_match_custom_names_with_location_codes.py deleted file mode 100644 index aa18668..0000000 --- a/tests/test_match_custom_names_with_location_codes.py +++ /dev/null @@ -1,87 +0,0 @@ -from flowmapper.flow import Flow -from flowmapper.match import match_custom_names_with_location_codes - - -def test_match_custom_names_with_location_codes_extra(): - s = Flow( - { - "name": "Water (ersatz), net cons., irrigation, HU", - "context": "air", - "unit": "kg", - } - ) - t = Flow( - {"name": "water, unspecified natural origin", "context": "air", "unit": "kg"} - ) - - actual = match_custom_names_with_location_codes(s, t) - expected = { - "comment": "Custom names with location code", - "location": "HU", - "irrigation": True, - } - assert actual == expected - - -def test_match_custom_names_with_location_codes_no_extra(): - s = Flow({"name": "Water, well, HU", "context": "air", "unit": "kg"}) - t = Flow({"name": "Water, well, in ground", "context": "air", "unit": "kg"}) - - actual = match_custom_names_with_location_codes(s, t) - expected = {"comment": "Custom names with location code", "location": "HU"} - assert actual == expected - - -def test_match_custom_names_with_location_codes_extra_whitespace_complicated(): - s = Flow( - { - "name": "Water (ersatz), net cons., irrigation, \t RER w/o DE+NL+NO", - "context": "air", - "unit": "kg", - } - ) - t = Flow( - {"name": "water, unspecified natural origin", "context": "air", "unit": "kg"} - ) - - actual = match_custom_names_with_location_codes(s, t) - expected = { - "comment": "Custom names with location code", - "location": "RER w/o DE+NL+NO", - "irrigation": True, - } - assert actual == expected - - -def test_match_custom_names_with_location_codes_no_match(): - s = Flow({"name": "Ersatz water, RER w/o DE+NL+NO", "context": "air", "unit": "kg"}) - t = Flow( - {"name": "water, unspecified natural origin", "context": "air", "unit": "kg"} - ) - assert match_custom_names_with_location_codes(s, t) is None - - -def test_match_custom_names_with_location_codes_conversion(): - s = Flow({"name": "Water, well, HU", "context": "air", 
"unit": "kilogram"}) - t = Flow( - {"name": "Water, well, in ground", "context": "air", "unit": "cubic_meter"} - ) - - actual = match_custom_names_with_location_codes(s, t) - expected = { - "comment": "Custom names with location code", - "location": "HU", - "conversion_factor": 0.001, - } - assert actual == expected - - s = Flow({"name": "Water, well, HU", "context": "air", "unit": "cubic_meter"}) - t = Flow({"name": "Water, well, in ground", "context": "air", "unit": "kilogram"}) - - actual = match_custom_names_with_location_codes(s, t) - expected = { - "comment": "Custom names with location code", - "location": "HU", - "conversion_factor": 1000.0, - } - assert actual == expected diff --git a/tests/test_match_identical_cas_numbers.py b/tests/test_match_identical_cas_numbers.py deleted file mode 100644 index 018baac..0000000 --- a/tests/test_match_identical_cas_numbers.py +++ /dev/null @@ -1,74 +0,0 @@ -from deepdiff import DeepDiff - -from flowmapper.flow import Flow -from flowmapper.match import match_identical_cas_numbers - - -def test_match_identical_cas_numbers(transformations): - source = { - "name": "1-Propanol", - "CAS number": "000071-23-8", - "checmical formula": "", - "Synonyms": "1-Propanol", - "unit": "kg", - "Class": "Waterborne emissions", - "context": "Emissions to water/groundwater", - "Flow UUID": "8C31919B-2D42-4CAD-A10E-8084CCD6BE99", - "Description": "Formula: C3H8O\u007f", - } - - target = { - "name": "Propanol", - "CAS number": "000071-23-8", - "checmical formula": "", - "Synonyms": "propan-1-ol, 1-propanol, propyl alcohol, n-propanol, n-propyl alcohol", - "unit": "kg", - "Class": "chemical", - "ExternalReference": "", - "Preferred": "", - "context": "water/ground-", - "identifier": "85500204-9d88-40ae-9f0b-3ceba0e7a74f", - "AltUnit": "", - "Var": "", - "Second CAS": "71-31-8; 19986-23-3; 71-23-8; 64118-40-7; 4712-36-1; 142583-61-7; 71-23-8", - } - - s = Flow(source, transformations) - t = Flow(target, transformations) - - assert match_identical_cas_numbers(s, t) - - -def test_match_missing_cas_numbers(transformations): - source = { - "name": "1-Propanol", - "CAS number": "", - "checmical formula": "", - "synonyms": "1-Propanol", - "unit": "kg", - "Class": "Waterborne emissions", - "context": "Emissions to water/groundwater", - "identifier": "8C31919B-2D42-4CAD-A10E-8084CCD6BE99", - "Description": "Formula: C3H8O\u007f", - } - - target = { - "name": "Propanol", - "CAS number": "", - "checmical formula": "", - "synonyms": "propan-1-ol, 1-propanol, propyl alcohol, n-propanol, n-propyl alcohol", - "unit": "kg", - "Class": "chemical", - "ExternalReference": "", - "Preferred": "", - "context": "water/ground-", - "identifier": "85500204-9d88-40ae-9f0b-3ceba0e7a74f", - "AltUnit": "", - "Var": "", - "Second CAS": "71-31-8; 19986-23-3; 71-23-8; 64118-40-7; 4712-36-1; 142583-61-7; 71-23-8", - } - - s = Flow(source, transformations) - t = Flow(target, transformations) - - assert not match_identical_cas_numbers(s, t) diff --git a/tests/test_match_identical_names.py b/tests/test_match_identical_names.py deleted file mode 100644 index af712e0..0000000 --- a/tests/test_match_identical_names.py +++ /dev/null @@ -1,51 +0,0 @@ -from deepdiff import DeepDiff - -from flowmapper.flow import Flow -from flowmapper.match import match_identical_names - - -def test_match_identical_names(transformations): - source = { - "name": "Carbon dioxide, in air", - "CAS No": "000124-38-9", - "unit": "kg", - "context": "Resources/in air", - "Flow UUID": "32722990-B7D8-44A8-BC7D-EC3A89F533FF", - } - - 
target = { - "name": "Carbon dioxide, in air", - "CAS number": "000124-38-9", - "unit": "kg", - "context": "natural resource/in air", - "identifier": "cc6a1abb-b123-4ca6-8f16-38209df609be", - } - - s = Flow(source, transformations) - t = Flow(target, transformations) - - match = match_identical_names(s, t) - assert match - - -def test_match_identical_names_jsonpath(transformations): - source = { - "name": "Carbon dioxide, in air", - "context": ["Raw", "(unspecified)"], - "unit": "kg", - "CAS": "000124-38-9", - } - - target = { - "identifier": "cc6a1abb-b123-4ca6-8f16-38209df609be", - "CAS number": "000124-38-9", - "name": "Carbon dioxide, in air", - "unit": "kg", - "context": ["natural resource", "in air"], - } - - s = Flow(source, transformations) - t = Flow(target, transformations) - - match = match_identical_names(s, t) - assert not match diff --git a/tests/test_match_identical_names_except_missing_suffix.py b/tests/test_match_identical_names_except_missing_suffix.py deleted file mode 100644 index 0fe7f89..0000000 --- a/tests/test_match_identical_names_except_missing_suffix.py +++ /dev/null @@ -1,43 +0,0 @@ -from flowmapper.flow import Flow -from flowmapper.match import match_identical_names_except_missing_suffix - - -def test_match_identical_names_except_missing_suffix(transformations): - source = { - "name": "Copper", - "CAS number": "007440-50-8", - "unit": "kg", - "context": "Emissions to water/groundwater", - "identifier": "F277F190-A8A4-4A2D-AAF6-F6CB3772A545", - } - target = { - "name": "Copper, ion", - "CAS number": "017493-86-6", - "unit": "kg", - "context": "water/ground-", - "identifier": "c3b659e5-35f1-408c-8cb5-b5f9b295c76e", - } - - s = Flow(source, transformations) - t = Flow(target, transformations) - - assert match_identical_names_except_missing_suffix(s, t, suffix="ion") - - -def test_match_identical_names_except_missing_suffix_different_order(transformations): - s = Flow( - {"name": "Iron, ion", "unit": "g", "context": ["Emissions to air", ""]}, - transformations, - ) - t = Flow( - { - "identifier": "8dba66e2-0f2e-4038-84ef-1e40b4f573a6", - "CAS number": "007439-89-6", - "name": "Iron", - "unit": "kg", - "context": ["air", "unspecified"], - }, - transformations, - ) - - assert match_identical_names_except_missing_suffix(s, t, suffix="ion") diff --git a/tests/test_match_identical_names_in_synonyms.py b/tests/test_match_identical_names_in_synonyms.py deleted file mode 100644 index 12adefa..0000000 --- a/tests/test_match_identical_names_in_synonyms.py +++ /dev/null @@ -1,32 +0,0 @@ -from flowmapper.flow import Flow -from flowmapper.match import match_identical_names_in_synonyms - - -def test_match_identical_names_in_synonyms(transformations): - source = { - "name": "Sulfuric acid", - "unit": "kg", - "context": ["Emissions to water", ""], - } - - target = { - "identifier": "8570c45a-8c78-4709-9b8f-fb88314d9e9d", - "chemical formula": "H8N2O4S", - "CAS number": "007783-20-2", - "name": "Ammonium sulfate", - "unit": "kg", - "context": ["water", "unspecified"], - "synonyms": [ - "Diammonium sulfate", - "Mascagnite", - "Sulfuric acid", - "Actamaster", - "Diammonium salt", - "Dolamin", - ], - } - - s = Flow(source, transformations) - t = Flow(target, transformations) - - assert match_identical_names_in_synonyms(s, t) diff --git a/tests/test_match_names_with_country_codes.py b/tests/test_match_names_with_country_codes.py deleted file mode 100644 index 0525067..0000000 --- a/tests/test_match_names_with_country_codes.py +++ /dev/null @@ -1,64 +0,0 @@ -from flowmapper.flow import 
Flow -from flowmapper.match import match_names_with_location_codes - - -def test_match_names_with_country_codes(): - s = Flow({"name": "Ammonia, NL", "context": "air", "unit": "kg"}) - t = Flow({"name": "Ammonia", "context": "air", "unit": "kg"}) - - actual = match_names_with_location_codes(s, t) - expected = {"comment": "Name matching with location code", "location": "NL"} - assert actual == expected - - -def test_match_names_with_country_codes_extra_whitespace(): - s = Flow({"name": "Ammonia, \tNL", "context": "air", "unit": "kg"}) - t = Flow({"name": "Ammonia", "context": "air", "unit": "kg"}) - - actual = match_names_with_location_codes(s, t) - expected = {"comment": "Name matching with location code", "location": "NL"} - assert actual == expected - - -def test_match_names_with_country_codes_no_match(): - s = Flow({"name": "Ammonia-NL", "context": "air", "unit": "kg"}) - t = Flow({"name": "Ammonia", "context": "air", "unit": "kg"}) - assert match_names_with_location_codes(s, t) is None - - -def test_match_names_with_country_codes_complicated_location(): - s = Flow({"name": "Ammonia, RER w/o DE+NL+NO", "context": "air", "unit": "kg"}) - t = Flow({"name": "Ammonia", "context": "air", "unit": "kg"}) - - actual = match_names_with_location_codes(s, t) - expected = { - "comment": "Name matching with location code", - "location": "RER w/o DE+NL+NO", - } - assert actual == expected - - -def test_match_names_with_country_codes_water_source_conversion(): - s = Flow({"name": "Water, NL", "context": "air", "unit": "kilogram"}) - t = Flow({"name": "Water", "context": "air", "unit": "cubic_meter"}) - - actual = match_names_with_location_codes(s, t) - expected = { - "comment": "Name matching with location code", - "location": "NL", - "conversion_factor": 0.001, - } - assert actual == expected - - -def test_match_names_with_country_codes_water_target_conversion(): - s = Flow({"name": "Water, NL", "context": "air", "unit": "cubic_meter"}) - t = Flow({"name": "Water", "context": "air", "unit": "kilogram"}) - - actual = match_names_with_location_codes(s, t) - expected = { - "comment": "Name matching with location code", - "location": "NL", - "conversion_factor": 1000.0, - } - assert actual == expected diff --git a/tests/test_match_non_ionic_state.py b/tests/test_match_non_ionic_state.py deleted file mode 100644 index 3b469cf..0000000 --- a/tests/test_match_non_ionic_state.py +++ /dev/null @@ -1,48 +0,0 @@ -from flowmapper.flow import Flow -from flowmapper.flowmap import Flowmap - - -def test_match_non_ionic_state(): - s = [ - Flow({"name": "Mercury (II)", "context": "air", "unit": "kg"}), - Flow({"name": "Manganese (II)", "context": "air", "unit": "kg"}), - ] - t = [ - Flow({"name": "Mercury", "context": "air", "unit": "kg", "identifier": "foo"}), - Flow( - { - "name": "Manganese II", - "context": "air", - "unit": "kg", - "identifier": "bar", - } - ), - ] - - flowmap = Flowmap(s, t) - actual = flowmap.to_randonneur() - expected = [ - { - "source": {"name": "Manganese (II)", "context": "air", "unit": "kg"}, - "target": { - "identifier": "bar", - "name": "Manganese II", - "context": "air", - "unit": "kg", - }, - "conversion_factor": 1.0, - "comment": "With/without roman numerals in parentheses", - }, - { - "source": {"name": "Mercury (II)", "context": "air", "unit": "kg"}, - "target": { - "identifier": "foo", - "name": "Mercury", - "context": "air", - "unit": "kg", - }, - "conversion_factor": 1.0, - "comment": "Non-ionic state if no better match", - }, - ] - assert actual == expected diff --git 
a/tests/test_normalize_str.py b/tests/test_normalize_str.py index d0b058f..6ce5cad 100644 --- a/tests/test_normalize_str.py +++ b/tests/test_normalize_str.py @@ -11,4 +11,6 @@ def test_normalize_str(): " \u00dcber", None, ] - assert {normalize_str(name) for name in names} == {"über", "Über", ""} + actual = {normalize_str(name) for name in names} + expected = {"über", "Über", ""} + assert actual == expected, f"Expected actual to equal {expected}, but got {actual}" diff --git a/tests/test_preferred_synonyms.py b/tests/test_preferred_synonyms.py deleted file mode 100644 index c6a6e33..0000000 --- a/tests/test_preferred_synonyms.py +++ /dev/null @@ -1,392 +0,0 @@ -import pytest - -from flowmapper.flow import Flow -from flowmapper.preferred_synonyms import ( - has_number_pattern_at_end, - has_roman_numeral_at_end, - match_identical_names_in_preferred_synonyms, -) - - -@pytest.mark.parametrize( - "text", - [ - "Chapter I", - "Section V", - "Appendix XXI", - "Book III", - "Part IV", - "Chapter VI", - "Section VII", - "Appendix VIII", - "Appendix VIII+", - "Appendix VIII-", - "Appendix viii", - "Book IX", - "Part X", - "Chapter XI", - "Section XV", - "Appendix XX", - "Book XXX", - "Chapter II ", # Trailing space - " Chapter III ", # Leading and trailing spaces - "Chapter (I)", # With parentheses - "Section (V+)", # With parentheses and plus - "Book (III-)", # With parentheses and minus - ], -) -def test_roman_numerals_should_match(text): - """Test that valid roman numerals at the end of strings are detected.""" - assert has_roman_numeral_at_end(text) - - -@pytest.mark.parametrize( - "text", - [ - "Chapter 1", - "Appendix VIII-+", - "Section A", - "Part XL", - "Chapter L", - "Appendix C", - "Chapter DC", - "Section M", - "Part MMMCMXCIX", # 3999 - "I am at the beginning", - "This ends with I but not roman", - "", - " ", - "Chapter", - ], -) -def test_non_roman_numerals_should_not_match(text): - """Test that invalid or non-roman numerals are not detected.""" - assert not has_roman_numeral_at_end(text) - - -@pytest.mark.parametrize( - "text", - [ - "Substance (1+)", - "Compound (2-)", - "Element (3)", - "Chemical (5+)", - "Material (7-)", - "Substance (9)", - "Element (11)", # Multi-digit numbers are allowed - "Substance (1+) ", # Trailing space - " Compound (2-) ", # Leading and trailing spaces - "Element (123+)", # Multiple digits with plus - "Compound (456-)", # Multiple digits with minus - ], -) -def test_number_patterns_should_match(text): - """Test that valid number patterns at the end of strings are detected.""" - assert has_number_pattern_at_end(text) - - -@pytest.mark.parametrize( - "text", - [ - "Chemical", - "Substance 1+", # Missing parentheses - "Molecule (1+2)", - "Compound (0)", - "Chemical ()", # Empty parentheses - "Material (+)", # Just plus sign - "Substance (-)", # Just minus sign - "Element (10)", - "Substance 1-", # Missing parentheses - "Chemical (5+-)", - "Substance 1-+", # Missing parentheses - "Molecule (1+2", # Missing closing parenthesis - "Element 1+2)", # Missing opening parenthesis - "Compound (1+2", # Missing closing parenthesis - "", - " ", - "Substance (1+2) extra", # Text after pattern - "(1+) Substance", # Pattern not at end - ], -) -def test_invalid_patterns_should_not_match(text): - """Test that invalid patterns are not detected.""" - assert not has_number_pattern_at_end(text) - - -def test_match_when_target_has_source_name_in_synonyms_with_roman_numeral(): - """Test matching when target has source name in synonyms and target name ends with roman numeral.""" - 
source_data = { - "name": "water", - "context": ["ground"], - "unit": "kg", - "synonyms": ["h2o"], - } - target_data = { - "name": "water I", # Ends with roman numeral - "context": ["ground"], - "unit": "kg", - "synonyms": ["water", "aqua"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result == {"comment": "Identical preferred synonyms"} - - -def test_match_when_target_has_source_name_in_synonyms_with_number_pattern(): - """Test matching when target has source name in synonyms and target name ends with number pattern.""" - source_data = { - "name": "carbon", - "context": ["air"], - "unit": "kg", - "synonyms": ["co2"], - } - target_data = { - "name": "carbon (2+)", # Ends with number pattern - "context": ["air"], - "unit": "kg", - "synonyms": ["carbon", "c"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result == {"comment": "Identical preferred synonyms"} - - -def test_match_when_source_has_target_name_in_synonyms_with_roman_numeral(): - """Test matching when source has target name in synonyms and source name ends with roman numeral.""" - source_data = { - "name": "nitrogen II", # Ends with roman numeral - "context": ["air"], - "unit": "kg", - "synonyms": ["nitrogen", "n2"], - } - target_data = { - "name": "nitrogen", - "context": ["air"], - "unit": "kg", - "synonyms": ["n2"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result == {"comment": "Identical preferred synonyms"} - - -def test_match_when_source_has_target_name_in_synonyms_with_number_pattern(): - """Test matching when source has target name in synonyms and source name ends with number pattern.""" - source_data = { - "name": "oxygen (1-)", # Ends with number pattern - "context": ["air"], - "unit": "kg", - "synonyms": ["oxygen", "o2"], - } - target_data = { - "name": "oxygen", - "context": ["air"], - "unit": "kg", - "synonyms": ["n2"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result == {"comment": "Identical preferred synonyms"} - - -def test_no_match_when_different_contexts(): - """Test that no match occurs when contexts are different.""" - source_data = { - "name": "water", - "context": ["ground"], - "unit": "kg", - "synonyms": ["h2o"], - } - target_data = { - "name": "water I", - "context": ["air"], # Different context - "unit": "kg", - "synonyms": ["water", "aqua"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result is None - - -def test_no_match_when_name_not_in_synonyms(): - """Test that no match occurs when name is not in synonyms.""" - source_data = { - "name": "water", - "context": ["ground"], - "unit": "kg", - "synonyms": ["h2o"], - } - target_data = { - "name": "water I", - "context": ["ground"], - "unit": "kg", - "synonyms": ["aqua", "liquid"], # "water" not in synonyms - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result is None - - -def test_no_match_when_no_roman_numeral_or_number_pattern(): - """Test that no match occurs when name doesn't end with roman numeral or number pattern.""" - source_data = { - 
"name": "water", - "context": ["ground"], - "unit": "kg", - "synonyms": ["h2o"], - } - target_data = { - "name": "water", # No roman numeral or number pattern - "context": ["ground"], - "unit": "kg", - "synonyms": ["water", "aqua"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result is None - - -def test_no_match_when_name_not_contained_in_other_name(): - """Test that no match occurs when one name is not contained in the other.""" - source_data = { - "name": "water", - "context": ["ground"], - "unit": "kg", - "synonyms": ["h2o"], - } - target_data = { - "name": "different I", # "water" not contained in "different_water I" - "context": ["ground"], - "unit": "kg", - "synonyms": ["water", "aqua"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result is None - - -def test_no_match_when_no_synonyms(): - """Test that no match occurs when flows have no synonyms.""" - source_data = { - "name": "water", - "context": ["ground"], - "unit": "kg", - "synonyms": [], # No synonyms - } - target_data = { - "name": "water I", - "context": ["ground"], - "unit": "kg", - "synonyms": [], # No synonyms - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result is None - - -def test_custom_comment(): - """Test that custom comment is returned when provided.""" - source_data = { - "name": "water", - "context": ["ground"], - "unit": "kg", - "synonyms": ["h2o"], - } - target_data = { - "name": "water I", - "context": ["ground"], - "unit": "kg", - "synonyms": ["water", "aqua"], - } - - source = Flow(source_data) - target = Flow(target_data) - - custom_comment = "Custom match comment" - result = match_identical_names_in_preferred_synonyms( - source, target, custom_comment - ) - - assert result == {"comment": custom_comment} - - -def test_match_with_roman_numeral_and_plus_minus(): - """Test matching with roman numerals that have + or - signs.""" - source_data = { - "name": "iron", - "context": ["ground"], - "unit": "kg", - "synonyms": ["fe"], - } - target_data = { - "name": "iron II+", # Roman numeral with plus - "context": ["ground"], - "unit": "kg", - "synonyms": ["iron", "fe"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result == {"comment": "Identical preferred synonyms"} - - -def test_match_with_number_pattern_and_plus_minus(): - """Test matching with number patterns that have + or - signs.""" - source_data = { - "name": "sodium", - "context": ["ground"], - "unit": "kg", - "synonyms": ["na"], - } - target_data = { - "name": "sodium (1+)", # Number pattern with plus - "context": ["ground"], - "unit": "kg", - "synonyms": ["sodium", "na"], - } - - source = Flow(source_data) - target = Flow(target_data) - - result = match_identical_names_in_preferred_synonyms(source, target) - - assert result == {"comment": "Identical preferred synonyms"} diff --git a/tests/test_prepare_transformations.py b/tests/test_prepare_transformations.py deleted file mode 100644 index 9ea7a35..0000000 --- a/tests/test_prepare_transformations.py +++ /dev/null @@ -1,2 +0,0 @@ -# TBD -# Also include pydantic stuff diff --git a/tests/test_rm_parentheses_roman_numerals.py b/tests/test_rm_parentheses_roman_numerals.py deleted file mode 100644 index 
94fa177..0000000 --- a/tests/test_rm_parentheses_roman_numerals.py +++ /dev/null @@ -1,24 +0,0 @@ -from flowmapper.utils import ( - rm_parentheses_roman_numerals, - rm_roman_numerals_ionic_state, -) - - -def test_rm_parentheses_roman_numerals(): - assert rm_parentheses_roman_numerals("chromium (iii)") == "chromium iii" - assert rm_parentheses_roman_numerals("chromium ( iii )") == "chromium iii" - assert ( - rm_parentheses_roman_numerals("water (evapotranspiration)") - == "water (evapotranspiration)" - ) - assert rm_parentheses_roman_numerals("metolachlor, (s)") == "metolachlor, (s)" - assert rm_parentheses_roman_numerals("chromium (vi)") == "chromium vi" - assert rm_parentheses_roman_numerals("beryllium (ii)") == "beryllium ii" - assert rm_parentheses_roman_numerals("thallium (i)") == "thallium i" - assert rm_parentheses_roman_numerals("tin (iv) oxide") == "tin iv oxide" - - -def test_rm_roman_numerals_ionic_state(): - assert rm_roman_numerals_ionic_state("mercury (ii)") == "mercury" - assert rm_roman_numerals_ionic_state("manganese (ii)") == "manganese" - assert rm_roman_numerals_ionic_state("molybdenum (vi)") == "molybdenum" diff --git a/tests/test_stringfield.py b/tests/test_stringfield.py deleted file mode 100644 index f59d04a..0000000 --- a/tests/test_stringfield.py +++ /dev/null @@ -1,70 +0,0 @@ -from flowmapper.string_field import StringField - - -def test_string_field_empty(): - sf = StringField(None) - assert sf.original is None - assert sf.normalized == "" - assert sf != "" - assert sf != "a" - assert sf != StringField("a") - assert sf is not None - assert not sf - assert repr(sf) == "StringField with missing original value" - - -def test_string_field_no_transformed(): - sf = StringField("A", use_lowercase=False) - assert sf.original == "A" - assert sf.normalized == "A" - assert sf == "A" - assert sf != "a" - assert sf == StringField("A", use_lowercase=True) - assert sf == StringField("A", use_lowercase=False) - assert sf != "B" - assert not sf.use_lowercase - assert sf - assert repr(sf) == "StringField: 'A' -> 'A'" - - -def test_string_field_no_transformed_lowercase(): - sf = StringField("A", use_lowercase=True) - assert sf.original == "A" - assert sf.normalized == "a" - assert sf == "a" - assert sf == "A" - assert sf == StringField("A", use_lowercase=True) - assert sf == StringField("A", use_lowercase=False) - assert sf != "B" - assert sf.use_lowercase - assert sf - assert repr(sf) == "StringField: 'A' -> 'a'" - - -def test_string_field_transformed(): - sf = StringField("A*", use_lowercase=False) - assert sf.original == "A*" - assert sf.normalized == "A*" - assert sf != "A" - assert sf != "a*" - assert sf == "A*" - assert sf == StringField("A*", use_lowercase=True) - assert sf == StringField("A*", use_lowercase=False) - assert sf != "B" - assert not sf.use_lowercase - assert sf - assert repr(sf) == "StringField: 'A*' -> 'A*'" - - -def test_string_field_transformed_lowercase(): - sf = StringField("A*", use_lowercase=True) - assert sf.original == "A*" - assert sf.normalized == "a*" - assert sf == "a*" - assert sf == "A*" - assert sf == StringField("A*", use_lowercase=True) - assert sf == StringField("A*", use_lowercase=False) - assert sf != "B" - assert sf.use_lowercase - assert sf - assert repr(sf) == "StringField: 'A*' -> 'a*'" diff --git a/tests/test_stringlist.py b/tests/test_stringlist.py deleted file mode 100644 index 88e2dee..0000000 --- a/tests/test_stringlist.py +++ /dev/null @@ -1,41 +0,0 @@ -from flowmapper.string_list import StringList - - -def 
test_string_list_empty(): - sl = StringList([]) - assert sl.data == [] - assert list(iter(sl)) == [] - assert len(sl) == 0 - assert not sl - assert repr(sl) == "StringList: Empty" - assert 1 not in sl - - -def test_string_list_no_transformed(): - sl = StringList(["A", "b"]) - assert "A" in sl - assert "b" in sl - assert len(sl) == 2 - assert sl - assert ( - repr(sl) - == "StringList: [\"StringField: 'A' -> 'a'\", \"StringField: 'b' -> 'b'\"]" - ) - assert list(iter(sl)) == ["a", "b"] - assert sl.data[0].original == "A" - assert sl.data[0].normalized == "a" - - -def test_string_list_transformed(): - sl = StringList(["A", "b"], ["A*", "b"]) - assert "A*" in sl - assert "b" in sl - assert len(sl) == 2 - assert sl - assert ( - repr(sl) - == "StringList: [\"StringField: 'A' -> 'a*'\", \"StringField: 'b' -> 'b'\"]" - ) - assert list(iter(sl)) == ["a*", "b"] - assert sl.data[0].original == "A" - assert sl.data[0].normalized == "a*" diff --git a/tests/test_transform_and_then_match.py b/tests/test_transform_and_then_match.py new file mode 100644 index 0000000..6d13d5f --- /dev/null +++ b/tests/test_transform_and_then_match.py @@ -0,0 +1,678 @@ +"""Tests for transform_and_then_match function.""" + +from copy import copy + +import pytest + +from flowmapper.domain.flow import Flow +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.matching import match_identical_names, transform_and_then_match + + +def test_transform_and_then_match_basic(): + """Test basic matching without transformations or filters.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow = Flow.from_dict(source_data) + target_flow = Flow.from_dict(target_data) + source_normalized = source_flow.normalize() + target_normalized = target_flow.normalize() + + source_flows = [ + NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + ] + + matches = transform_and_then_match( + source_flows=source_flows, + target_flows=target_flows, + match_function=match_identical_names, + ) + + assert len(matches) == 1, "Expected one match" + assert matches[0].source == source_flow, "Expected match to reference source flow" + assert matches[0].target == target_flow, "Expected match to reference target flow" + + # Verify flows are reset + assert ( + source_flows[0].current.name.data == source_normalized.name.data + ), "Expected source flow to be reset" + assert ( + target_flows[0].current.name.data == target_normalized.name.data + ), "Expected target flow to be reset" + + +def test_transform_and_then_match_with_transformation(): + """Test matching with transformations applied.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow = Flow.from_dict(source_data) + target_flow = Flow.from_dict(target_data) + source_normalized = source_flow.normalize() + target_normalized = target_flow.normalize() + + source_flows = [ + NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + ] + + def 
transform_func(flows): + for flow in flows: + flow.update_current(name="Modified name") + return flows + + matches = transform_and_then_match( + source_flows=source_flows, + target_flows=target_flows, + match_function=match_identical_names, + transform_source_flows=[transform_func], + transform_target_flows=[transform_func], + ) + + # Should match because both are transformed to "Modified name" + assert len(matches) == 1, "Expected one match after transformation" + + # Verify flows are reset + assert ( + source_flows[0].current.name.data == source_normalized.name.data + ), f"Expected source flow to be reset after transformation, got {source_flows[0].current.name.data!r} != {source_normalized.name.data!r}" + assert ( + target_flows[0].current.name.data == target_normalized.name.data + ), f"Expected target flow to be reset after transformation, got {target_flows[0].current.name.data!r} != {target_normalized.name.data!r}" + + +def test_transform_and_then_match_with_filter(): + """Test matching with filters applied.""" + source_data1 = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_data2 = { + "name": "Water", + "context": "water", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow1 = Flow.from_dict(source_data1) + source_flow2 = Flow.from_dict(source_data2) + target_flow = Flow.from_dict(target_data) + + source_flows = [ + NormalizedFlow( + original=source_flow1, + normalized=source_flow1.normalize(), + current=copy(source_flow1.normalize()), + ), + NormalizedFlow( + original=source_flow2, + normalized=source_flow2.normalize(), + current=copy(source_flow2.normalize()), + ), + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_flow.normalize(), + current=copy(target_flow.normalize()), + ) + ] + + def filter_air_flows(flows): + return [f for f in flows if "air" in str(f.current.context)] + + matches = transform_and_then_match( + source_flows=source_flows, + target_flows=target_flows, + match_function=match_identical_names, + filter_source_flows=filter_air_flows, + ) + + # Should match only the carbon dioxide flow (air context), not water + assert len(matches) == 1, "Expected one match after filtering" + assert ( + matches[0].source == source_flow1 + ), "Expected match to reference filtered source flow" + + # Verify all flows are reset (including the filtered one) + assert ( + source_flows[0].current.name.data == source_flow1.normalize().name.data + ), "Expected first source flow to be reset" + assert ( + source_flows[1].current.name.data == source_flow2.normalize().name.data + ), "Expected second source flow to be reset" + + +def test_transform_and_then_match_with_transform_and_filter(): + """Test matching with both transformations and filters.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow = Flow.from_dict(source_data) + target_flow = Flow.from_dict(target_data) + source_normalized = source_flow.normalize() + target_normalized = target_flow.normalize() + + source_flows = [ + NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + ] + + def transform_func(flows): + for flow in flows: + flow.update_current(name="Transformed name") + 
return flows
+
+    def filter_func(flows):
+        # Filter to only flows with "Transformed" in name
+        return [f for f in flows if "Transformed" in f.current.name.data]
+
+    matches = transform_and_then_match(
+        source_flows=source_flows,
+        target_flows=target_flows,
+        match_function=match_identical_names,
+        transform_source_flows=[transform_func],
+        transform_target_flows=[transform_func],
+        filter_source_flows=filter_func,
+        filter_target_flows=filter_func,
+    )
+
+    # Should match because both are transformed and pass filter
+    assert len(matches) == 1, "Expected one match after transformation and filtering"
+
+    # Verify flows are reset
+    assert (
+        source_flows[0].current.name.data == source_normalized.name.data
+    ), "Expected source flow to be reset"
+
+
+def test_transform_and_then_match_does_not_reset_on_exception():
+    """Test that flows are NOT reset when match function raises exception."""
+    source_data = {
+        "name": "Carbon dioxide",
+        "context": "air",
+        "unit": "kg",
+    }
+    target_data = {
+        "name": "Carbon dioxide",
+        "context": "air",
+        "unit": "kg",
+    }
+
+    source_flow = Flow.from_dict(source_data)
+    target_flow = Flow.from_dict(target_data)
+    source_normalized = source_flow.normalize()
+    target_normalized = target_flow.normalize()
+
+    source_flows = [
+        NormalizedFlow(
+            original=source_flow,
+            normalized=source_normalized,
+            current=copy(source_normalized),
+        )
+    ]
+    target_flows = [
+        NormalizedFlow(
+            original=target_flow,
+            normalized=target_normalized,
+            current=copy(target_normalized),
+        )
+    ]
+
+    def transform_func(flows):
+        for flow in flows:
+            flow.update_current(name="Modified")
+        return flows
+
+    def failing_match_function(source_flows, target_flows):
+        raise ValueError("Test exception")
+
+    try:
+        transform_and_then_match(
+            source_flows=source_flows,
+            target_flows=target_flows,
+            match_function=failing_match_function,
+            transform_source_flows=[transform_func],
+            transform_target_flows=[transform_func],
+        )
+    except ValueError:
+        pass
+
+    # Verify flows are NOT reset when exception occurs
+    # (This documents current behavior - flows are only reset on success)
+    assert (
+        source_flows[0].current.name.data == "Modified"
+    ), "Expected source flow to NOT be reset when exception occurs"
+    assert (
+        target_flows[0].current.name.data == "Modified"
+    ), "Expected target flow to NOT be reset when exception occurs"
+
+
+def test_transform_and_then_match_only_source_transformation():
+    """Test matching with only source flow transformation."""
+    source_data = {
+        "name": "Carbon dioxide",
+        "context": "air",
+        "unit": "kg",
+    }
+    target_data = {
+        "name": "Carbon dioxide",
+        "context": "air",
+        "unit": "kg",
+    }
+
+    source_flow = Flow.from_dict(source_data)
+    target_flow = Flow.from_dict(target_data)
+    source_normalized = source_flow.normalize()
+    target_normalized = target_flow.normalize()
+
+    source_flows = [
+        NormalizedFlow(
+            original=source_flow,
+            normalized=source_normalized,
+            current=copy(source_normalized),
+        )
+    ]
+    target_flows = [
+        NormalizedFlow(
+            original=target_flow,
+            normalized=target_normalized,
+            current=copy(target_normalized),
+        )
+    ]
+
+    def transform_source(flows):
+        for flow in flows:
+            flow.update_current(name="Modified source")
+        return flows
+
+    matches = transform_and_then_match(
+        source_flows=source_flows,
+        target_flows=target_flows,
+        match_function=match_identical_names,
+        transform_source_flows=[transform_source],
+    )
+
+    # Should not match because only source is transformed
+    assert len(matches) == 0, "Expected no match when only source is transformed"
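+    # Even though no match was produced, transform_and_then_match should still
+    # restore each flow, so current is compared against the normalized baseline below.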
+ + # Verify flows are reset + assert ( + source_flows[0].current.name.data == source_normalized.name.data + ), "Expected source flow to be reset" + + +def test_transform_and_then_match_filter_returns_empty_list(): + """Test matching when filter returns empty list.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow = Flow.from_dict(source_data) + target_flow = Flow.from_dict(target_data) + source_normalized = source_flow.normalize() + target_normalized = target_flow.normalize() + + source_flows = [ + NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + ] + + def filter_nothing(flows): + return [] + + matches = transform_and_then_match( + source_flows=source_flows, + target_flows=target_flows, + match_function=match_identical_names, + filter_source_flows=filter_nothing, + ) + + # Should have no matches because filter returns empty list + assert len(matches) == 0, "Expected no matches when filter returns empty list" + + # Verify flows are still reset + assert ( + source_flows[0].current.name.data == source_normalized.name.data + ), "Expected source flow to be reset even when filtered out" + + +def test_transform_and_then_match_with_list_of_transformations(): + """Test matching with a list of transformations applied in sequence.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow = Flow.from_dict(source_data) + target_flow = Flow.from_dict(target_data) + source_normalized = source_flow.normalize() + target_normalized = target_flow.normalize() + + source_flows = [ + NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + ] + + def transform1(flows): + for flow in flows: + flow.update_current(name="First transformation") + return flows + + def transform2(flows): + for flow in flows: + flow.update_current(name="Second transformation") + return flows + + matches = transform_and_then_match( + source_flows=source_flows, + target_flows=target_flows, + match_function=match_identical_names, + transform_source_flows=[transform1, transform2], + transform_target_flows=[transform1, transform2], + ) + + # Should match because both are transformed through the same sequence + assert len(matches) == 1, "Expected one match after multiple transformations" + + # Verify flows are reset + assert ( + source_flows[0].current.name.data == source_normalized.name.data + ), "Expected source flow to be reset after transformations" + assert ( + target_flows[0].current.name.data == target_normalized.name.data + ), "Expected target flow to be reset after transformations" + + +def test_transform_and_then_match_list_transformations_sequence(): + """Test that list transformations are applied in the correct sequence.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow = Flow.from_dict(source_data) + target_flow = 
Flow.from_dict(target_data) + source_normalized = source_flow.normalize() + target_normalized = target_flow.normalize() + + source_flows = [ + NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + ] + + # Track transformation order + transform_order = [] + + def transform1(flows): + transform_order.append("transform1") + for flow in flows: + flow.update_current(name="Transform1") + return flows + + def transform2(flows): + transform_order.append("transform2") + for flow in flows: + flow.update_current(name="Transform2") + return flows + + def transform3(flows): + transform_order.append("transform3") + for flow in flows: + flow.update_current(name="Transform3") + return flows + + # Apply transformations in sequence + matches = transform_and_then_match( + source_flows=source_flows, + target_flows=target_flows, + match_function=match_identical_names, + transform_source_flows=[transform1, transform2, transform3], + transform_target_flows=[transform1, transform2, transform3], + ) + + # Verify transformations were applied in order + assert transform_order == [ + "transform1", + "transform2", + "transform3", + "transform1", + "transform2", + "transform3", + ], f"Expected transformations in order, got {transform_order}" + + # Final name should be from transform3 + # But we need to check during matching, so let's verify the match happened + assert len(matches) == 1, "Expected match after sequential transformations" + + +def test_transform_and_then_match_single_function_still_works(): + """Test that single function transformation works when wrapped in a list.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow = Flow.from_dict(source_data) + target_flow = Flow.from_dict(target_data) + source_normalized = source_flow.normalize() + target_normalized = target_flow.normalize() + + source_flows = [ + NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + ] + + def transform_func(flows): + for flow in flows: + flow.update_current(name="Single transform") + return flows + + # Test with single function wrapped in list + matches = transform_and_then_match( + source_flows=source_flows, + target_flows=target_flows, + match_function=match_identical_names, + transform_source_flows=[transform_func], + transform_target_flows=[transform_func], + ) + + # Should match because both are transformed + assert len(matches) == 1, "Expected one match with single transformation function" + + # Verify flows are reset + assert ( + source_flows[0].current.name.data == source_normalized.name.data + ), "Expected source flow to be reset" + + +def test_transform_and_then_match_mixed_single_and_list(): + """Test matching with single function for source and list for target.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + + source_flow = Flow.from_dict(source_data) + target_flow = Flow.from_dict(target_data) + source_normalized = source_flow.normalize() + target_normalized = 
target_flow.normalize() + + source_flows = [ + NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + ] + target_flows = [ + NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + ] + + def single_transform(flows): + for flow in flows: + flow.update_current(name="Single") + return flows + + def list_transform1(flows): + for flow in flows: + flow.update_current(name="List1") + return flows + + def list_transform2(flows): + for flow in flows: + flow.update_current(name="List2") + return flows + + # Source: single function in list, Target: list of functions + matches = transform_and_then_match( + source_flows=source_flows, + target_flows=target_flows, + match_function=match_identical_names, + transform_source_flows=[single_transform], + transform_target_flows=[list_transform1, list_transform2], + ) + + # Should not match because names differ: "Single" vs "List2" + assert len(matches) == 0, "Expected no match when transformations differ" + + # Verify flows are reset + assert ( + source_flows[0].current.name.data == source_normalized.name.data + ), "Expected source flow to be reset" diff --git a/tests/test_transform_flow.py b/tests/test_transform_flow.py deleted file mode 100644 index 339d7bb..0000000 --- a/tests/test_transform_flow.py +++ /dev/null @@ -1,123 +0,0 @@ -import json -from pathlib import Path - -from flowmapper.flow import Flow -from flowmapper.flowmap import Flowmap -from flowmapper.transformation_mapping import prepare_transformations - -DATA_DIR = Path(__file__).parent / "data" - - -def test_transform_flow_without_default_transformations(): - transformations = prepare_transformations( - [json.load(open(DATA_DIR / "transformations.json"))] - ) - source_flows = json.load(open(DATA_DIR / "sp.json")) - source_flows = [Flow(flow, transformations) for flow in source_flows] - target_flows = json.load(open(DATA_DIR / "ei-3.7.json")) - target_flows = [Flow(flow, transformations) for flow in target_flows] - - flowmap = Flowmap(source_flows, target_flows) - actual = flowmap.to_randonneur() - - expected = [ - { - "source": { - "name": "1,4-Butanediol", - "unit": "kg", - "context": "air", - "CAS number": "110-63-4", - }, - "target": { - "name": "1,4-Butanediol", - "unit": "kg", - "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "context": ["air", "unspecified"], - "CAS number": "110-63-4", - }, - "conversion_factor": 1.0, - "comment": "Identical names", - }, - { - "source": { - "name": "1,4-Butanediol", - "unit": "kg", - "context": "air/high. 
pop.", - "CAS number": "110-63-4", - }, - "target": { - "name": "1,4-Butanediol", - "unit": "kg", - "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "context": ["air", "unspecified"], - "CAS number": "110-63-4", - }, - "conversion_factor": 1.0, - "comment": "Identical names", - }, - ] - assert actual == expected - - -def test_transform_flow_with_default_transformations(transformations): - all_transformations = transformations + prepare_transformations( - [json.load(open(DATA_DIR / "transformations.json"))] - ) - source_flows = json.load(open(DATA_DIR / "sp.json")) - source_flows = [Flow(flow, all_transformations) for flow in source_flows] - target_flows = json.load(open(DATA_DIR / "ei-3.7.json")) - target_flows = [Flow(flow, all_transformations) for flow in target_flows] - - flowmap = Flowmap(source_flows, target_flows) - actual = flowmap.to_randonneur() - - expected = [ - { - "comment": "Identical names", - "conversion_factor": 1.0, - "source": { - "CAS number": "110-63-4", - "context": "air", - "name": "1,4-Butanediol", - "unit": "kg", - }, - "target": { - "CAS number": "110-63-4", - "context": ["air", "unspecified"], - "identifier": "09db39be-d9a6-4fc3-8d25-1f80b23e9131", - "name": "1,4-Butanediol", - "unit": "kg", - }, - }, - { - "comment": "Identical names", - "conversion_factor": 1.2142857142857142, - "source": { - "context": "air/low. pop.", - "name": "Ammonia, as N", - "unit": "kg", - }, - "target": { - "CAS number": "7664-41-7", - "context": ["air", "non-urban air or from high stacks"], - "identifier": "0f440cc0-0f74-446d-99d6-8ff0e97a2444", - "name": "Ammonia", - "unit": "kg", - }, - }, - { - "comment": "Name matching with location code", - "conversion_factor": 1.0, - "source": {"context": "air/low. pop.", "name": "Ammonia, FR", "unit": "kg"}, - "target": { - "CAS number": "7664-41-7", - "context": ["air", "non-urban air or from high stacks"], - "identifier": "0f440cc0-0f74-446d-99d6-8ff0e97a2444", - "location": "FR", - "name": "Ammonia", - "unit": "kg", - }, - }, - ] - - assert actual == expected diff --git a/tests/test_unit.py b/tests/test_unit.py deleted file mode 100644 index f1e395e..0000000 --- a/tests/test_unit.py +++ /dev/null @@ -1,90 +0,0 @@ -import math - -from flowmapper.transformation_mapping import prepare_transformations -from flowmapper.unit import UnitField -from flowmapper.utils import apply_transformations, load_standard_transformations - - -def test_equals_with_loaded_transformation(): - transformations = prepare_transformations(load_standard_transformations()) - - a = {"unit": "M2A"} - a_t = apply_transformations(a, transformations) - b = {"unit": "m2*year"} - b_t = apply_transformations(b, transformations) - - u1 = UnitField(a["unit"], a_t["unit"]) - u2 = UnitField(b["unit"], b_t["unit"]) - - assert u1 == u2 - - -def test_equals_mass(): - u1 = UnitField("kg") - u2 = UnitField("kilogram") - - assert u1 == u2 - - -def test_energy(): - u1 = UnitField("kilowatt hour") - u2 = UnitField("MJ") - assert u1.compatible(u2) - assert u1.conversion_factor(u2) == 3.6 - - -def test_enrichment(): - u1 = UnitField("SWU") - u2 = UnitField("tonne * SW") - assert u1.compatible(u2) - assert u1.conversion_factor(u2) == 1e-3 - - -def test_natural_gas(): - u1 = UnitField("nm3") - u2 = UnitField("sm3") - assert u1.compatible(u2) - - -def test_livestock(): - u1 = UnitField("LU") - u2 = UnitField("livestock unit") - assert u1 == u2 - - -def test_freight(): - u1 = UnitField("kilogram * km") - u2 = UnitField("tkm") - assert u1.conversion_factor(u2) == 1e-3 - - -def 
test_vehicular_travel(): - u1 = UnitField("vehicle * m") - u2 = UnitField("vkm") - assert u1.conversion_factor(u2) == 1e-3 - - -def test_person_travel(): - u1 = UnitField("person * m") - u2 = UnitField("pkm") - assert u1.conversion_factor(u2) == 1e-3 - - -def test_conversion_factor(): - u1 = UnitField("mg") - u2 = UnitField("kg") - actual = u1.conversion_factor(u2) - assert actual == 1e-06 - - -def test_nan_conversion_factor(): - u1 = UnitField("bq") - u2 = UnitField("kg") - actual = u1.conversion_factor(u2) - assert math.isnan(actual) - - -def test_complex_conversions(): - u1 = UnitField("square_meter_year / t") - u2 = UnitField("(meter ** 2 * month) / kg") - assert u1.conversion_factor(u2) == 0.012 diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..828322c --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1 @@ +"""Unit tests for flowmapper using mocks.""" diff --git a/tests/unit/domain/__init__.py b/tests/unit/domain/__init__.py new file mode 100644 index 0000000..9b513de --- /dev/null +++ b/tests/unit/domain/__init__.py @@ -0,0 +1 @@ +"""Unit tests for domain entities.""" diff --git a/tests/unit/domain/test_flow.py b/tests/unit/domain/test_flow.py new file mode 100644 index 0000000..64e3c42 --- /dev/null +++ b/tests/unit/domain/test_flow.py @@ -0,0 +1,733 @@ +import pytest + +from flowmapper.domain.flow import Flow + + +class TestFlowRepr: + """Test Flow __repr__ method.""" + + def test_repr_basic_flow(self): + """Test Flow __repr__ with only required fields.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + result = repr(flow) + assert "Flow(" in result, "Expected 'Flow(' in repr" + assert "name=" in result, "Expected 'name=' in repr" + assert "unit=" in result, "Expected 'unit=' in repr" + assert "context=" in result, "Expected 'context=' in repr" + assert ( + "Carbon dioxide" in result or "carbon dioxide" in result + ), "Expected name in repr" + + def test_repr_with_identifier(self): + """Test Flow __repr__ with identifier.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "test-id-123", + } + ) + result = repr(flow) + assert "identifier=" in result, "Expected 'identifier=' in repr" + assert "test-id-123" in result, "Expected identifier value in repr" + + def test_repr_with_location(self): + """Test Flow __repr__ with location.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg", "location": "US"} + ) + result = repr(flow) + assert "location=" in result, "Expected 'location=' in repr" + assert "US" in result, "Expected location value in repr" + + def test_repr_with_cas_number(self): + """Test Flow __repr__ with CAS number.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "cas_number": "000124-38-9", + } + ) + result = repr(flow) + assert "cas_number=" in result, "Expected 'cas_number=' in repr" + # CAS number is normalized, so check for normalized format + assert ( + "124-38-9" in result or "000124-38-9" in result + ), "Expected CAS number in repr" + + def test_repr_with_synonyms(self): + """Test Flow __repr__ with synonyms.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "synonyms": ["CO2", "carbon dioxide"], + } + ) + result = repr(flow) + assert "synonyms=" in result, "Expected 'synonyms=' in repr" + assert "CO2" in result, "Expected synonym in repr" + + def test_repr_with_all_fields(self): + 
"""Test Flow __repr__ with all optional fields.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "test-id", + "location": "US", + "cas_number": "000124-38-9", + "synonyms": ["CO2"], + "conversion_factor": 2.5, + } + ) + result = repr(flow) + assert "name=" in result, "Expected 'name=' in repr" + assert "unit=" in result, "Expected 'unit=' in repr" + assert "context=" in result, "Expected 'context=' in repr" + assert "identifier=" in result, "Expected 'identifier=' in repr" + assert "location=" in result, "Expected 'location=' in repr" + assert "cas_number=" in result, "Expected 'cas_number=' in repr" + assert "synonyms=" in result, "Expected 'synonyms=' in repr" + assert "conversion_factor=" in result, "Expected 'conversion_factor=' in repr" + + def test_repr_without_optional_fields(self): + """Test Flow __repr__ without optional fields (should not include them).""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + result = repr(flow) + assert ( + "identifier=" not in result + ), "Expected 'identifier=' not in repr when None" + assert "location=" not in result, "Expected 'location=' not in repr when None" + assert ( + "cas_number=" not in result + ), "Expected 'cas_number=' not in repr when None" + assert "synonyms=" not in result, "Expected 'synonyms=' not in repr when empty" + assert ( + "conversion_factor=" not in result + ), "Expected 'conversion_factor=' not in repr when None" + + def test_repr_with_empty_synonyms(self): + """Test Flow __repr__ with empty synonyms list (should not include).""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg", "synonyms": []} + ) + result = repr(flow) + assert ( + "synonyms=" not in result + ), "Expected 'synonyms=' not in repr when empty list" + + def test_repr_with_oxidation_state(self): + """Test Flow __repr__ with oxidation state.""" + flow = Flow.from_dict( + { + "name": "Iron(II) oxide", + "context": "air", + "unit": "kg", + } + ) + # Oxidation state is extracted during normalization, but we can set it directly + + # Create a flow with oxidation state + normalized = flow.normalize() + result = repr(normalized) + # Oxidation state might be extracted from name, check if it's in repr + # The repr will show it if it's not None + if normalized.oxidation_state is not None: + assert "oxidation_state=" in result, "Expected 'oxidation_state=' in repr" + + +class TestFlowCopyWithNewLocation: + """Test Flow copy_with_new_location method.""" + + def test_copy_with_new_location_basic(self): + """Test copy_with_new_location with simple location replacement.""" + flow = Flow.from_dict({"name": "Ammonia, NL", "context": "air", "unit": "kg"}) + new_flow = flow.copy_with_new_location("DE") + + assert new_flow.name.data == "Ammonia, DE", "Expected name to have new location" + assert new_flow.context == flow.context, "Expected context to be preserved" + assert new_flow.unit == flow.unit, "Expected unit to be preserved" + assert new_flow._id != flow._id, "Expected new Flow instance with different _id" + + def test_copy_with_new_location_preserves_attributes(self): + """Test copy_with_new_location preserves all other attributes except identifier.""" + flow = Flow.from_dict( + { + "name": "Ammonia, NL", + "context": "air", + "unit": "kg", + "identifier": "test-id-123", + "location": "US", + "cas_number": "0007664-41-7", + "synonyms": ["NH3"], + } + ) + new_flow = flow.copy_with_new_location("DE") + + assert ( + new_flow.identifier 
!= flow.identifier + ), "Expected identifier to be a new UUID, not preserved" + assert new_flow.identifier is not None, "Expected identifier to be set" + assert ( + new_flow.cas_number == flow.cas_number + ), "Expected cas_number to be preserved" + assert new_flow.synonyms == flow.synonyms, "Expected synonyms to be preserved" + assert new_flow.context == flow.context, "Expected context to be preserved" + assert new_flow.unit == flow.unit, "Expected unit to be preserved" + + def test_copy_with_new_location_multiple_commas(self): + """Test copy_with_new_location with multiple commas in name.""" + flow = Flow.from_dict( + {"name": "Ammonia, pure, NL", "context": "air", "unit": "kg"} + ) + new_flow = flow.copy_with_new_location("FR") + + assert ( + new_flow.name.data == "Ammonia, pure, FR" + ), "Expected location at end to be replaced" + + def test_copy_with_new_location_complex_location(self): + """Test copy_with_new_location with complex location codes.""" + flow = Flow.from_dict( + {"name": "Ammonia, RER w/o DE+NL+NO", "context": "air", "unit": "kg"} + ) + new_flow = flow.copy_with_new_location("GLO") + + assert ( + new_flow.name.data == "Ammonia, GLO" + ), "Expected complex location to be replaced with simple one" + + def test_copy_with_new_location_simple_to_complex(self): + """Test copy_with_new_location replacing simple location with complex one.""" + flow = Flow.from_dict({"name": "Ammonia, NL", "context": "air", "unit": "kg"}) + new_flow = flow.copy_with_new_location("RER w/o DE+NL+NO") + + assert ( + new_flow.name.data == "Ammonia, RER w/o DE+NL+NO" + ), "Expected simple location to be replaced with complex one" + + def test_copy_with_new_location_appends_when_no_location_suffix(self): + """Test copy_with_new_location appends location when no location suffix exists.""" + flow = Flow.from_dict({"name": "Ammonia", "context": "air", "unit": "kg"}) + new_flow = flow.copy_with_new_location("DE") + + assert new_flow.name.data == "Ammonia, DE", "Expected location to be appended" + assert new_flow.identifier != flow.identifier, "Expected new identifier" + + def test_copy_with_new_location_appends_with_dash_location(self): + """Test copy_with_new_location appends location when dash-separated location exists.""" + flow = Flow.from_dict({"name": "Ammonia-NL", "context": "air", "unit": "kg"}) + new_flow = flow.copy_with_new_location("DE") + + assert ( + new_flow.name.data == "Ammonia-NL, DE" + ), "Expected location to be appended" + assert new_flow.identifier != flow.identifier, "Expected new identifier" + + def test_copy_with_new_location_appends_when_location_in_middle(self): + """Test copy_with_new_location appends location when location not at end.""" + flow = Flow.from_dict( + {"name": "Ammonia, NL, pure", "context": "air", "unit": "kg"} + ) + new_flow = flow.copy_with_new_location("DE") + + assert ( + new_flow.name.data == "Ammonia, NL, pure, DE" + ), "Expected location to be appended" + assert new_flow.identifier != flow.identifier, "Expected new identifier" + + def test_copy_with_new_location_various_locations(self): + """Test copy_with_new_location with various location codes.""" + test_cases = [ + ("Water, DE", "FR", "Water, FR"), + ("Water, FR", "US", "Water, US"), + ("Water, US", "GLO", "Water, GLO"), + ("Water, GLO", "DE", "Water, DE"), + ] + + for name, new_location, expected_name in test_cases: + flow = Flow.from_dict({"name": name, "context": "air", "unit": "kg"}) + new_flow = flow.copy_with_new_location(new_location) + assert ( + new_flow.name.data == expected_name + ), 
f"Expected '{expected_name}' for '{name}' -> '{new_location}', but got {new_flow.name.data!r}" + + def test_copy_with_new_location_only_location_code(self): + """Test copy_with_new_location with only location code in name.""" + flow = Flow.from_dict({"name": ", NL", "context": "air", "unit": "kg"}) + new_flow = flow.copy_with_new_location("DE") + + assert new_flow.name.data == ", DE", "Expected location to be replaced" + + def test_copy_with_new_location_with_trailing_whitespace(self): + """Test copy_with_new_location preserves trailing whitespace.""" + flow = Flow.from_dict({"name": "Ammonia, NL ", "context": "air", "unit": "kg"}) + new_flow = flow.copy_with_new_location("DE") + + assert ( + new_flow.name.data == "Ammonia, DE " + ), "Expected trailing whitespace to be preserved" + + def test_copy_with_new_location_creates_new_instance(self): + """Test copy_with_new_location creates a new Flow instance.""" + flow = Flow.from_dict({"name": "Ammonia, NL", "context": "air", "unit": "kg"}) + new_flow = flow.copy_with_new_location("DE") + + assert new_flow is not flow, "Expected new Flow instance" + assert new_flow._id != flow._id, "Expected different _id" + + def test_copy_with_new_location_original_unchanged(self): + """Test copy_with_new_location does not modify original flow.""" + flow = Flow.from_dict({"name": "Ammonia, NL", "context": "air", "unit": "kg"}) + original_name = flow.name.data + + new_flow = flow.copy_with_new_location("DE") + + assert ( + flow.name.data == original_name + ), "Expected original flow name to be unchanged" + assert ( + new_flow.name.data != original_name + ), "Expected new flow name to be different" + + def test_copy_with_new_location_with_all_fields(self): + """Test copy_with_new_location with flow containing all fields.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide, NL", + "context": ("Raw", "(unspecified)"), + "unit": "kg", + "identifier": "test-id-123", + "cas_number": "000124-38-9", + "synonyms": ["CO2"], + } + ) + new_flow = flow.copy_with_new_location("DE") + + # Check name is updated + assert ( + new_flow.name.data == "Carbon dioxide, DE" + ), "Expected name to have new location" + # Check all other fields are preserved except identifier + assert ( + new_flow.identifier != flow.identifier + ), "Expected identifier to be a new UUID, not preserved" + assert new_flow.identifier is not None, "Expected identifier to be set" + assert new_flow.context == flow.context, "Expected context preserved" + assert new_flow.unit == flow.unit, "Expected unit preserved" + assert new_flow.cas_number == flow.cas_number, "Expected cas_number preserved" + assert new_flow.synonyms == flow.synonyms, "Expected synonyms preserved" + + def test_copy_with_new_location_raises_value_error_empty_location(self): + """Test copy_with_new_location raises ValueError when location parameter is empty.""" + flow = Flow.from_dict({"name": "Ammonia, NL", "context": "air", "unit": "kg"}) + + with pytest.raises(ValueError, match="No location parameter given"): + flow.copy_with_new_location("") + + with pytest.raises(ValueError, match="No location parameter given"): + flow.copy_with_new_location(None) + + def test_copy_with_new_location_sets_new_identifier(self): + """Test copy_with_new_location sets a new UUID identifier.""" + import uuid + + flow = Flow.from_dict( + { + "name": "Ammonia, NL", + "context": "air", + "unit": "kg", + "identifier": "test-id-123", + } + ) + new_flow = flow.copy_with_new_location("DE") + + # Verify identifier is different + assert ( + new_flow.identifier != 
flow.identifier + ), "Expected identifier to be different from original" + assert new_flow.identifier is not None, "Expected identifier to be set" + # Verify it's a valid UUID format + try: + uuid.UUID(new_flow.identifier) + except ValueError: + pytest.fail( + f"Expected identifier to be a valid UUID, but got {new_flow.identifier!r}" + ) + + def test_copy_with_new_location_identifier_when_none(self): + """Test copy_with_new_location sets identifier even when original is None.""" + import uuid + + flow = Flow.from_dict({"name": "Ammonia, NL", "context": "air", "unit": "kg"}) + assert flow.identifier is None, "Expected original identifier to be None" + + new_flow = flow.copy_with_new_location("DE") + + # Verify identifier is set even when original was None + assert ( + new_flow.identifier is not None + ), "Expected identifier to be set even when original was None" + # Verify it's a valid UUID format + try: + uuid.UUID(new_flow.identifier) + except ValueError: + pytest.fail( + f"Expected identifier to be a valid UUID, but got {new_flow.identifier!r}" + ) + + +class TestFlowToDict: + """Test Flow to_dict method.""" + + def test_to_dict_with_all_fields(self): + """Test to_dict with all fields populated.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "test-id-123", + "location": "NL", + "cas_number": "000124-38-9", + "synonyms": ["CO2", "Carbon dioxide"], + "conversion_factor": 2.5, + } + ) + result = flow.to_dict() + + assert result["name"] == "Carbon dioxide", "Expected name in dict" + assert result["unit"] == "kg", "Expected unit in dict" + # Context as_tuple() returns string if value is string, tuple if list/tuple + assert result["context"] == "air", "Expected context as string (from as_tuple)" + assert result["identifier"] == "test-id-123", "Expected identifier in dict" + assert result["location"] == "NL", "Expected location in dict" + assert result["cas_number"] == flow.cas_number, "Expected cas_number in dict" + assert result["synonyms"] == [ + "CO2", + "Carbon dioxide", + ], "Expected synonyms in dict" + assert result["conversion_factor"] == 2.5, "Expected conversion_factor in dict" + + def test_to_dict_with_only_required_fields(self): + """Test to_dict with only required fields.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + result = flow.to_dict() + + assert result["name"] == "Carbon dioxide", "Expected name in dict" + assert result["unit"] == "kg", "Expected unit in dict" + # Context as_tuple() returns string if value is string + assert result["context"] == "air", "Expected context as string (from as_tuple)" + assert result["identifier"] is None, "Expected identifier to be None" + assert "location" not in result, "Expected location not in dict when None" + assert "cas_number" not in result, "Expected cas_number not in dict when None" + assert "synonyms" not in result, "Expected synonyms not in dict when empty" + assert ( + "conversion_factor" not in result + ), "Expected conversion_factor not in dict when None" + + def test_to_dict_excludes_none_optional_fields(self): + """Test to_dict excludes None optional fields.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": None, + } + ) + result = flow.to_dict() + + assert "location" not in result, "Expected location not in dict when None" + assert ( + "oxidation_state" not in result + ), "Expected oxidation_state not in dict when None" + assert "cas_number" not in 
result, "Expected cas_number not in dict when None" + assert "synonyms" not in result, "Expected synonyms not in dict when empty" + assert ( + "conversion_factor" not in result + ), "Expected conversion_factor not in dict when None" + + def test_to_dict_excludes_empty_synonyms(self): + """Test to_dict excludes empty synonyms list.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "synonyms": [], + } + ) + result = flow.to_dict() + + assert "synonyms" not in result, "Expected empty synonyms not in dict" + + def test_to_dict_context_as_tuple(self): + """Test to_dict converts context to tuple format.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": ["Raw", "(unspecified)"], + "unit": "kg", + } + ) + result = flow.to_dict() + + # When context is a list, as_tuple() returns a tuple (not normalized) + assert isinstance(result["context"], tuple), "Expected context to be tuple" + assert result["context"] == ( + "Raw", + "(unspecified)", + ), "Expected context tuple (not normalized in to_dict)" + + +class TestFlowRandonneurMapping: + """Test Flow randonneur_mapping static method.""" + + def test_randonneur_mapping_returns_dict(self): + """Test randonneur_mapping returns dictionary structure.""" + result = Flow.randonneur_mapping() + + assert isinstance(result, dict), "Expected dict return type" + assert "expression language" in result, "Expected expression language key" + assert "labels" in result, "Expected labels key" + + def test_randonneur_mapping_expression_language(self): + """Test randonneur_mapping has correct expression language.""" + result = Flow.randonneur_mapping() + + assert ( + result["expression language"] == "JSONPath" + ), "Expected JSONPath expression language" + + def test_randonneur_mapping_all_attributes_mapped(self): + """Test randonneur_mapping includes all Flow attributes.""" + result = Flow.randonneur_mapping() + labels = result["labels"] + + assert "unit" in labels, "Expected unit mapping" + assert "name" in labels, "Expected name mapping" + assert "context" in labels, "Expected context mapping" + assert "identifier" in labels, "Expected identifier mapping" + assert "location" in labels, "Expected location mapping" + assert "cas_number" in labels, "Expected cas_number mapping" + assert "synonyms" in labels, "Expected synonyms mapping" + + def test_randonneur_mapping_jsonpath_expressions(self): + """Test randonneur_mapping has correct JSONPath expressions.""" + result = Flow.randonneur_mapping() + labels = result["labels"] + + assert labels["unit"] == "$.unit", "Expected unit JSONPath" + assert labels["name"] == "$.name", "Expected name JSONPath" + assert labels["context"] == "$.context", "Expected context JSONPath" + assert labels["identifier"] == "$.identifier", "Expected identifier JSONPath" + assert labels["location"] == "$.location", "Expected location JSONPath" + assert labels["cas_number"] == "$.cas_number", "Expected cas_number JSONPath" + assert labels["synonyms"] == "$.synonyms", "Expected synonyms JSONPath" + assert "conversion_factor" in labels, "Expected conversion_factor mapping" + assert ( + labels["conversion_factor"] == "$.conversion_factor" + ), "Expected conversion_factor JSONPath" + + +class TestFlowEquality: + """Test Flow __eq__ method.""" + + def test_eq_same_instance(self): + """Test equality with same instance.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + assert flow == flow, "Expected flow to equal itself" + + def 
test_eq_different_instances_same_data(self): + """Test different flows with same data are not equal (different _id).""" + flow1 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + flow2 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + assert flow1 != flow2, "Expected flows with different _id to not be equal" + + def test_eq_different_objects(self): + """Test equality with non-Flow objects returns False.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + assert flow != "not a flow", "Expected flow to not equal string" + assert flow != 123, "Expected flow to not equal number" + assert flow != None, "Expected flow to not equal None" # noqa: E711 + + +class TestFlowComparison: + """Test Flow __lt__ method.""" + + def test_lt_sorts_by_name(self): + """Test sorting by name.""" + flow1 = Flow.from_dict({"name": "Ammonia", "context": "air", "unit": "kg"}) + flow2 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + assert flow1 < flow2, "Expected Ammonia < Carbon dioxide" + assert not (flow2 < flow1), "Expected Carbon dioxide not < Ammonia" + + def test_lt_sorts_by_unit_when_names_equal(self): + """Test sorting by unit when names are equal.""" + flow1 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "g"} + ) + flow2 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + assert flow1 < flow2, "Expected g < kg when names are equal" + + def test_lt_sorts_by_context_when_name_and_unit_equal(self): + """Test sorting by context when name and unit are equal.""" + flow1 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + flow2 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "water", "unit": "kg"} + ) + + assert flow1 < flow2, "Expected air < water when name and unit are equal" + + def test_lt_sorts_by_identifier_when_other_fields_equal(self): + """Test sorting by identifier when other fields are equal.""" + flow1 = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "id1", + } + ) + flow2 = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "id2", + } + ) + + assert flow1 < flow2, "Expected id1 < id2 when other fields are equal" + + def test_lt_with_non_flow_object(self): + """Test comparison with non-Flow objects.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + # __lt__ should return False for non-Flow objects + result = flow < "not a flow" + assert result is False, "Expected __lt__ to return False for non-Flow objects" + + +class TestFlowConversionFactor: + """Test Flow conversion_factor attribute.""" + + def test_conversion_factor_from_dict(self): + """Test conversion_factor can be set via from_dict.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "conversion_factor": 2.5, + } + ) + assert flow.conversion_factor == 2.5, "Expected conversion_factor to be set" + + def test_conversion_factor_none_by_default(self): + """Test conversion_factor is None by default.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + assert ( + flow.conversion_factor is None + ), "Expected conversion_factor to be None by default" + + def test_conversion_factor_preserved_in_normalize(self): + """Test conversion_factor is preserved during 
normalization.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + "conversion_factor": 3.0, + } + ) + normalized = flow.normalize() + assert ( + normalized.conversion_factor == 3.0 + ), "Expected conversion_factor to be preserved in normalize" + + def test_conversion_factor_in_to_dict_when_present(self): + """Test conversion_factor included in to_dict when present.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "conversion_factor": 1.5, + } + ) + result = flow.to_dict() + assert "conversion_factor" in result, "Expected conversion_factor in dict" + assert ( + result["conversion_factor"] == 1.5 + ), "Expected conversion_factor value in dict" + + def test_conversion_factor_not_in_to_dict_when_none(self): + """Test conversion_factor excluded from to_dict when None.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + result = flow.to_dict() + assert ( + "conversion_factor" not in result + ), "Expected conversion_factor not in dict when None" + + def test_conversion_factor_in_repr_when_present(self): + """Test conversion_factor included in __repr__ when present.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "conversion_factor": 2.0, + } + ) + result = repr(flow) + assert "conversion_factor=" in result, "Expected conversion_factor in repr" + assert "2.0" in result, "Expected conversion_factor value in repr" + + def test_conversion_factor_not_in_repr_when_none(self): + """Test conversion_factor excluded from __repr__ when None.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + result = repr(flow) + assert ( + "conversion_factor=" not in result + ), "Expected conversion_factor not in repr when None" diff --git a/tests/unit/domain/test_match.py b/tests/unit/domain/test_match.py new file mode 100644 index 0000000..d7071dd --- /dev/null +++ b/tests/unit/domain/test_match.py @@ -0,0 +1,452 @@ +"""Unit tests for Match class.""" + +from copy import copy + +import pytest + +from flowmapper.domain.flow import Flow +from flowmapper.domain.match import Match +from flowmapper.domain.match_condition import MatchCondition + + +class TestMatchInitialization: + """Test Match class initialization.""" + + def test_match_initialization_with_required_fields(self): + """Test Match initialization with only required fields.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.exact, + ) + + assert match.source == source_flow, "Expected source to match" + assert match.target == target_flow, "Expected target to match" + assert match.function_name == "test_function", "Expected function_name to match" + assert match.condition == MatchCondition.exact, "Expected condition to match" + assert match.conversion_factor == 1.0, "Expected default conversion_factor" + assert match.comment == "", "Expected default empty comment" + assert ( + match.new_target_flow is False + ), "Expected default new_target_flow to be False" + + def test_match_initialization_with_all_fields(self): + """Test Match initialization with all fields including new_target_flow.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", 
"unit": "kg"} + ) + target_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.related, + conversion_factor=2.5, + comment="Test comment", + new_target_flow=True, + ) + + assert match.source == source_flow, "Expected source to match" + assert match.target == target_flow, "Expected target to match" + assert match.function_name == "test_function", "Expected function_name to match" + assert match.condition == MatchCondition.related, "Expected condition to match" + assert match.conversion_factor == 2.5, "Expected conversion_factor to match" + assert match.comment == "Test comment", "Expected comment to match" + assert match.new_target_flow is True, "Expected new_target_flow to be True" + + def test_match_initialization_with_new_target_flow_false(self): + """Test Match initialization with new_target_flow explicitly set to False.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.exact, + new_target_flow=False, + ) + + assert match.new_target_flow is False, "Expected new_target_flow to be False" + + def test_match_initialization_with_different_conditions(self): + """Test Match initialization with different MatchCondition values.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + for condition in MatchCondition: + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=condition, + new_target_flow=True, + ) + assert match.condition == condition, f"Expected condition to be {condition}" + assert match.new_target_flow is True, "Expected new_target_flow to be True" + + +class TestMatchExport: + """Test Match export method.""" + + def test_export_basic(self): + """Test basic export without metadata.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.exact, + new_target_flow=True, + ) + + exported = match.export() + + assert "source" in exported, "Expected source in exported data" + assert "target" in exported, "Expected target in exported data" + # Export uses the original flow data (not normalized) + assert ( + exported["source"]["name"] == "Carbon dioxide" + ), "Expected source name in export" + assert ( + exported["target"]["name"] == "Carbon dioxide" + ), "Expected target name in export" + # Condition is exported as SKOS URI via as_glad() method + assert ( + exported["condition"] == "http://www.w3.org/2004/02/skos/core#exactMatch" + ), "Expected condition as SKOS URI" + assert exported["conversion_factor"] == 1.0, "Expected conversion_factor" + assert exported["comment"] == "", "Expected comment" + assert exported["new_target_flow"] is True, "Expected new_target_flow in export" + assert "function_name" not in exported, "Expected function_name to be removed" + + def test_export_with_metadata(self): + """Test export with 
flowmapper_metadata enabled.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.close, + new_target_flow=False, + ) + + exported = match.export(flowmapper_metadata=True) + + assert ( + "flowmapper_metadata" in exported + ), "Expected flowmapper_metadata in export" + assert exported["flowmapper_metadata"]["function_name"] == "test_function" + assert "version" in exported["flowmapper_metadata"] + assert ( + exported["new_target_flow"] is False + ), "Expected new_target_flow in export" + + def test_export_with_new_target_flow(self): + """Test export includes new_target_flow attribute.""" + source_flow = Flow.from_dict( + {"name": "Water", "context": "water", "unit": "kg"} + ) + target_flow = Flow.from_dict( + {"name": "Water", "context": "water", "unit": "kg"} + ) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.related, + new_target_flow=True, + comment="New target flow", + ) + + exported = match.export() + + assert ( + exported["new_target_flow"] is True + ), "Expected new_target_flow to be True in export" + assert ( + exported["comment"] == "New target flow" + ), "Expected comment to be preserved" + + +class TestMatchComparison: + """Test Match comparison methods.""" + + def test_match_less_than_comparison(self): + """Test Match __lt__ method for sorting.""" + source1 = Flow.from_dict({"name": "A", "context": "air", "unit": "kg"}) + target1 = Flow.from_dict({"name": "B", "context": "air", "unit": "kg"}) + source2 = Flow.from_dict({"name": "C", "context": "air", "unit": "kg"}) + target2 = Flow.from_dict({"name": "D", "context": "air", "unit": "kg"}) + + match1 = Match( + source=source1, + target=target1, + function_name="test", + condition=MatchCondition.exact, + new_target_flow=True, + ) + match2 = Match( + source=source2, + target=target2, + function_name="test", + condition=MatchCondition.exact, + new_target_flow=False, + ) + + assert match1 < match2, "Expected match1 to be less than match2" + assert not (match2 < match1), "Expected match2 not to be less than match1" + + def test_match_comparison_with_same_source_different_target(self): + """Test Match comparison with same source but different target.""" + source = Flow.from_dict({"name": "A", "context": "air", "unit": "kg"}) + target1 = Flow.from_dict({"name": "B", "context": "air", "unit": "kg"}) + target2 = Flow.from_dict({"name": "C", "context": "air", "unit": "kg"}) + + match1 = Match( + source=source, + target=target1, + function_name="test", + condition=MatchCondition.exact, + new_target_flow=True, + ) + match2 = Match( + source=source, + target=target2, + function_name="test", + condition=MatchCondition.exact, + new_target_flow=False, + ) + + assert ( + match1 < match2 + ), "Expected match1 to be less than match2 based on target name" + + def test_match_comparison_new_target_flow_does_not_affect_sorting(self): + """Test that new_target_flow does not affect comparison.""" + source1 = Flow.from_dict({"name": "A", "context": "air", "unit": "kg"}) + target1 = Flow.from_dict({"name": "B", "context": "air", "unit": "kg"}) + source2 = Flow.from_dict({"name": "C", "context": "air", "unit": "kg"}) + target2 = Flow.from_dict({"name": "D", "context": "air", "unit": "kg"}) + + match1 = Match( + 
source=source1, + target=target1, + function_name="test", + condition=MatchCondition.exact, + new_target_flow=True, + ) + match2 = Match( + source=source2, + target=target2, + function_name="test", + condition=MatchCondition.exact, + new_target_flow=False, + ) + + # Comparison should be based on source/target names, not new_target_flow + assert ( + match1 < match2 + ), "Expected comparison based on names, not new_target_flow" + + +class TestMatchWithComplexFlows: + """Test Match with complex flow data.""" + + def test_match_with_all_flow_fields(self): + """Test Match with flows containing all possible fields.""" + source_flow = Flow.from_dict( + { + "name": "Carbon dioxide, in air", + "context": ["Emissions", "to air"], + "unit": "kg", + "identifier": "source-id", + "location": "US", + "cas_number": "000124-38-9", + "synonyms": ["CO2"], + } + ) + target_flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": ["Emissions", "to air"], + "unit": "kg", + "identifier": "target-id", + "location": "CA", + "cas_number": "124-38-9", + } + ) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.close, + conversion_factor=1.5, + comment="Complex match", + new_target_flow=True, + ) + + assert match.source == source_flow + assert match.target == target_flow + assert match.new_target_flow is True + + exported = match.export() + assert exported["new_target_flow"] is True + assert exported["conversion_factor"] == 1.5 + assert exported["comment"] == "Complex match" + + +class TestMatchExportEdgeCases: + """Test Match export edge cases.""" + + def test_export_excludes_private_attributes(self): + """Test export excludes _id and other private attributes.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict({"name": "CO2", "context": "air", "unit": "kg"}) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.exact, + ) + + exported = match.export() + + # Check source and target don't have _id + assert "_id" not in exported["source"], "Expected _id not in exported source" + assert "_id" not in exported["target"], "Expected _id not in exported target" + + def test_export_with_flowmapper_metadata_true(self): + """Test export with flowmapper_metadata=True includes version.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict({"name": "CO2", "context": "air", "unit": "kg"}) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.exact, + ) + + exported = match.export(flowmapper_metadata=True) + + assert ( + "flowmapper_metadata" in exported + ), "Expected flowmapper_metadata in export" + assert ( + "version" in exported["flowmapper_metadata"] + ), "Expected version in metadata" + assert ( + "function_name" in exported["flowmapper_metadata"] + ), "Expected function_name in metadata" + assert ( + exported["flowmapper_metadata"]["function_name"] == "test_function" + ), "Expected function_name to match" + + def test_export_with_flowmapper_metadata_false(self): + """Test export with flowmapper_metadata=False excludes metadata.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict({"name": "CO2", "context": "air", "unit": "kg"}) + + match = Match( + source=source_flow, + 
target=target_flow, + function_name="test_function", + condition=MatchCondition.exact, + ) + + exported = match.export(flowmapper_metadata=False) + + assert ( + "flowmapper_metadata" not in exported + ), "Expected flowmapper_metadata not in export" + + def test_export_serializes_userstring_objects(self): + """Test export serializes UserString objects in source/target.""" + from flowmapper.fields import StringField + + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_flow = Flow.from_dict({"name": "CO2", "context": "air", "unit": "kg"}) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.exact, + ) + + exported = match.export() + + # StringField is a UserString subclass, should be serialized to string + assert isinstance( + exported["source"]["name"], str + ), "Expected name to be string, not UserString" + assert isinstance( + exported["target"]["name"], str + ), "Expected name to be string, not UserString" + + def test_export_serializes_contextfield_objects(self): + """Test export serializes ContextField objects.""" + source_flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": ["air", "unspecified"], "unit": "kg"} + ) + target_flow = Flow.from_dict({"name": "CO2", "context": "air", "unit": "kg"}) + + match = Match( + source=source_flow, + target=target_flow, + function_name="test_function", + condition=MatchCondition.exact, + ) + + exported = match.export() + + # ContextField should be serialized to its value + assert isinstance( + exported["source"]["context"], (str, tuple, list) + ), "Expected context to be serialized" + assert not hasattr( + exported["source"]["context"], "value" + ), "Expected context not to be ContextField object" diff --git a/tests/unit/domain/test_match_condition.py b/tests/unit/domain/test_match_condition.py new file mode 100644 index 0000000..cde514b --- /dev/null +++ b/tests/unit/domain/test_match_condition.py @@ -0,0 +1,81 @@ +"""Unit tests for MatchCondition enum.""" + +import pytest + +from flowmapper.domain.match_condition import MatchCondition + + +class TestMatchConditionAsGlad: + """Test MatchCondition as_glad method.""" + + def test_exact_match_returns_equals(self): + """Test exact match returns '='.""" + assert ( + MatchCondition.exact.as_glad() == "=" + ), "Expected exact match to return '='" + + def test_close_match_returns_tilde(self): + """Test close match returns '~'.""" + assert ( + MatchCondition.close.as_glad() == "~" + ), "Expected close match to return '~'" + + def test_related_match_returns_tilde(self): + """Test related match returns '~'.""" + assert ( + MatchCondition.related.as_glad() == "~" + ), "Expected related match to return '~'" + + def test_narrow_match_returns_greater_than(self): + """Test narrow match returns '>'.""" + assert ( + MatchCondition.narrow.as_glad() == ">" + ), "Expected narrow match to return '>'" + + def test_broad_match_returns_less_than(self): + """Test broad match returns '<'.""" + assert ( + MatchCondition.broad.as_glad() == "<" + ), "Expected broad match to return '<'" + + def test_all_enum_values_have_glad_symbols(self): + """Test all enum values have corresponding GLAD symbols.""" + glad_symbols = {condition.as_glad() for condition in MatchCondition} + + assert "=" in glad_symbols, "Expected '=' symbol for exact match" + assert "~" in glad_symbols, "Expected '~' symbol for close/related match" + assert ">" in glad_symbols, "Expected '>' symbol for narrow match" + assert "<" in 
glad_symbols, "Expected '<' symbol for broad match" + + +class TestMatchConditionEnumValues: + """Test MatchCondition enum values.""" + + def test_all_values_are_valid_skos_uris(self): + """Test all enum values are valid SKOS URIs.""" + skos_base = "http://www.w3.org/2004/02/skos/core#" + + for condition in MatchCondition: + assert condition.value.startswith( + skos_base + ), f"Expected {condition.name} to be SKOS URI" + assert "#" in condition.value, f"Expected {condition.value} to contain '#'" + + def test_enum_can_be_used_in_comparisons(self): + """Test enum can be used in comparisons.""" + assert MatchCondition.exact == MatchCondition.exact, "Expected exact == exact" + assert MatchCondition.exact != MatchCondition.close, "Expected exact != close" + assert MatchCondition.exact in [ + MatchCondition.exact, + MatchCondition.close, + ], "Expected exact in list" + + def test_enum_string_representation(self): + """Test enum string representation.""" + assert ( + str(MatchCondition.exact) == MatchCondition.exact.value + ), "Expected str() to return value" + assert ( + repr(MatchCondition.exact) + == f"" + ), "Expected repr() to show enum name and value" diff --git a/tests/unit/domain/test_normalized_flow.py b/tests/unit/domain/test_normalized_flow.py new file mode 100644 index 0000000..83df1b8 --- /dev/null +++ b/tests/unit/domain/test_normalized_flow.py @@ -0,0 +1,987 @@ +"""Unit tests for NormalizedFlow class.""" + +from copy import copy + +import pytest + +from flowmapper.domain.flow import Flow +from flowmapper.domain.normalized_flow import NormalizedFlow + + +class TestNormalizedFlowResetCurrent: + """Test NormalizedFlow reset_current method.""" + + def test_reset_current_resets_to_normalized(self): + """Test reset_current resets current to normalized flow.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + # Modify current + nf.update_current(name="Modified name") + assert ( + nf.current.name.data != normalized.name.data + ), "Expected current to be different from normalized after update" + + # Reset + nf.reset_current() + assert ( + nf.current.name.data == normalized.name.data + ), f"Expected current.name to equal normalized.name after reset, but got {nf.current.name.data!r} != {normalized.name.data!r}" + assert ( + nf.current.unit.data == normalized.unit.data + ), f"Expected current.unit to equal normalized.unit after reset, but got {nf.current.unit.data!r} != {normalized.unit.data!r}" + assert ( + nf.current.context.value == normalized.context.value + ), f"Expected current.context to equal normalized.context after reset, but got {nf.current.context.value!r} != {normalized.context.value!r}" + + def test_reset_current_creates_new_instance(self): + """Test reset_current creates a new Flow instance.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + # Modify current + nf.update_current(name="Modified name") + old_current_id = nf.current._id + + # Reset + nf.reset_current() + assert ( + nf.current._id != old_current_id + ), "Expected reset_current to create a new Flow instance with different _id" + assert ( + nf.current is not normalized + ), "Expected reset_current to create a copy, not 
reference to normalized" + + def test_reset_current_preserves_normalized(self): + """Test reset_current does not modify normalized flow.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + # Modify current multiple times + nf.update_current(name="First modification") + nf.update_current(name="Second modification") + nf.update_current(unit="g") + + # Reset + nf.reset_current() + + # Check normalized is unchanged + assert ( + normalized.name.data == "carbon dioxide" + ), f"Expected normalized.name to be unchanged, but got {normalized.name.data!r}" + # Unit is normalized (kg -> kilogram), so check normalized value + assert ( + normalized.unit.data == "kilogram" + ), f"Expected normalized.unit to be unchanged, but got {normalized.unit.data!r}" + + def test_reset_current_with_complex_flow(self): + """Test reset_current with flow containing all fields.""" + data = { + "name": "Carbon dioxide, in air", + "context": ["Raw", "(unspecified)"], + "unit": "kg", + "identifier": "test-id-123", + "location": "US", + "cas_number": "000124-38-9", + "synonyms": ["CO2"], + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + # Modify multiple fields + nf.update_current(name="Modified", unit="g", location="CA") + + # Reset + nf.reset_current() + + # Verify all fields are reset + assert ( + nf.current.name.data == normalized.name.data + ), "Expected name to be reset to normalized" + assert ( + nf.current.unit.data == normalized.unit.data + ), "Expected unit to be reset to normalized" + assert ( + nf.current.location == normalized.location + ), "Expected location to be reset to normalized" + assert ( + nf.current.identifier == normalized.identifier + ), "Expected identifier to be reset to normalized" + assert ( + nf.current.cas_number == normalized.cas_number + ), "Expected cas_number to be reset to normalized" + + +class TestNormalizedFlowUpdateCurrent: + """Test NormalizedFlow update_current method.""" + + def test_update_current_with_name(self): + """Test update_current with name parameter.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(name="Updated name") + assert ( + nf.current.name.data == "Updated name" + ), f"Expected current.name to be 'Updated name', but got {nf.current.name.data!r}" + assert ( + nf.current.unit.data == normalized.unit.data + ), "Expected unit to remain unchanged" + assert ( + nf.current.context.value == normalized.context.value + ), "Expected context to remain unchanged" + + def test_update_current_with_unit(self): + """Test update_current with unit parameter.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(unit="g") + assert ( + nf.current.unit.data == "g" + ), f"Expected current.unit to be 'g', but got {nf.current.unit.data!r}" + assert ( + nf.current.name.data == normalized.name.data + ), "Expected name to remain unchanged" + + 
def test_update_current_with_context(self): + """Test update_current with context parameter.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(context=["water", "unspecified"]) + assert nf.current.context.value == [ + "water", + "unspecified", + ], f"Expected current.context to be ['water', 'unspecified'], but got {nf.current.context.value!r}" + + def test_update_current_with_multiple_fields(self): + """Test update_current with multiple fields.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(name="Updated name", unit="g", context="water") + assert nf.current.name.data == "Updated name", "Expected name to be updated" + assert nf.current.unit.data == "g", "Expected unit to be updated" + assert nf.current.context.value == "water", "Expected context to be updated" + + def test_update_current_with_location(self): + """Test update_current with location parameter.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "location": "US", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(location="CA") + assert ( + nf.current.location == "CA" + ), f"Expected current.location to be 'CA', but got {nf.current.location!r}" + + def test_update_current_with_identifier(self): + """Test update_current with identifier parameter.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "original-id", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(identifier="new-id") + assert ( + nf.current.identifier == "new-id" + ), f"Expected current.identifier to be 'new-id', but got {nf.current.identifier!r}" + + def test_update_current_with_cas_number(self): + """Test update_current with cas_number parameter.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "cas_number": "000124-38-9", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(cas_number="000078-79-5") + # CAS numbers are normalized (leading zeros removed) when passed through from_string + assert ( + nf.current.cas_number.data == "78-79-5" + ), f"Expected current.cas_number to be '78-79-5' (normalized), but got {nf.current.cas_number.data!r}" + + def test_update_current_with_synonyms(self): + """Test update_current with synonyms parameter.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "synonyms": ["CO2"], + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(synonyms=["CO2", "carbon dioxide"]) + assert nf.current.synonyms == [ + "CO2", + "carbon dioxide", + ], f"Expected current.synonyms to be ['CO2', 'carbon dioxide'], but got 
{nf.current.synonyms!r}" + + def test_update_current_creates_new_instance(self): + """Test update_current creates a new Flow instance.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + old_current_id = nf.current._id + nf.update_current(name="Updated") + assert ( + nf.current._id != old_current_id + ), "Expected update_current to create a new Flow instance with different _id" + + def test_update_current_preserves_normalized(self): + """Test update_current does not modify normalized flow.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(name="Updated", unit="g") + assert ( + normalized.name.data == "carbon dioxide" + ), "Expected normalized.name to be unchanged" + # Unit is normalized (kg -> kilogram), so check normalized value + assert ( + normalized.unit.data == "kilogram" + ), f"Expected normalized.unit to be unchanged, but got {normalized.unit.data!r}" + + def test_update_current_based_on_normalized(self): + """Test update_current uses normalized as base, not current.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + # First update + nf.update_current(name="First update") + assert nf.current.name.data == "First update", "Expected first update to work" + + # Second update - should be based on normalized, not "First update" + nf.update_current(unit="g") + assert ( + nf.current.name.data == normalized.name.data + ), "Expected name to revert to normalized value when not specified in update" + assert nf.current.unit.data == "g", "Expected unit to be updated" + + def test_update_current_with_empty_synonyms(self): + """Test update_current with empty synonyms list.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "synonyms": ["CO2"], + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(synonyms=[]) + assert ( + nf.current.synonyms == [] + ), f"Expected current.synonyms to be empty list, but got {nf.current.synonyms!r}" + + def test_update_current_with_none_location(self): + """Test update_current with None location.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "location": "US", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + nf.update_current(location=None) + assert ( + nf.current.location is None + ), f"Expected current.location to be None, but got {nf.current.location!r}" + + def test_update_current_with_oxidation_state(self): + """Test update_current with oxidation_state parameter.""" + data = { + "name": "Iron(II) oxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + # Note: oxidation_state 
is extracted from name during normalization + # This test verifies we can update it if needed + from flowmapper.fields import OxidationState + + nf.update_current(oxidation_state=3) + assert ( + nf.current.oxidation_state.value == 3 + ), f"Expected current.oxidation_state to be 3, but got {nf.current.oxidation_state.value if nf.current.oxidation_state else None!r}" + + +class TestNormalizedFlowRepr: + """Test NormalizedFlow __repr__ method.""" + + def test_repr_basic_normalized_flow(self): + """Test NormalizedFlow __repr__ with basic flow.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + result = repr(nf) + assert "NormalizedFlow(" in result, "Expected 'NormalizedFlow(' in repr" + assert "original=" in result, "Expected 'original=' in repr" + assert "current=" in result, "Expected 'current=' in repr" + assert "matched=" in result, "Expected 'matched=' in repr" + + def test_repr_shows_original_and_current(self): + """Test NormalizedFlow __repr__ shows both original and current flows.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + result = repr(nf) + # Check that original Flow repr is included + assert "Flow(" in result, "Expected 'Flow(' in repr (from original or current)" + # Check that both original and current are represented + assert result.count("Flow(") >= 2, "Expected at least 2 Flow() representations" + + def test_repr_with_matched_true(self): + """Test NormalizedFlow __repr__ with matched=True.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, + normalized=normalized, + current=copy(normalized), + matched=True, + ) + + result = repr(nf) + assert "matched=True" in result, "Expected 'matched=True' in repr" + + def test_repr_with_matched_false(self): + """Test NormalizedFlow __repr__ with matched=False.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, + normalized=normalized, + current=copy(normalized), + matched=False, + ) + + result = repr(nf) + assert "matched=False" in result, "Expected 'matched=False' in repr" + + def test_repr_with_modified_current(self): + """Test NormalizedFlow __repr__ shows modified current flow.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + # Modify current + nf.update_current(name="Modified name") + + result = repr(nf) + assert ( + "Modified name" in result or "modified name" in result + ), "Expected modified name in repr" + # Original should still be in repr + assert ( + "Carbon dioxide" in result or "carbon dioxide" in result + ), "Expected original name in repr" + + def test_repr_with_all_fields(self): + """Test NormalizedFlow __repr__ with flows containing all fields.""" + data = { + "name": "Carbon dioxide, in air", + "context": ["Raw", "(unspecified)"], + 
"unit": "kg", + "identifier": "test-id-123", + "location": "US", + "cas_number": "000124-38-9", + "synonyms": ["CO2"], + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + result = repr(nf) + # Should include information from both original and current + assert "original=" in result, "Expected 'original=' in repr" + assert "current=" in result, "Expected 'current=' in repr" + # The Flow reprs should include their fields + assert ( + "identifier=" in result or "test-id-123" in result + ), "Expected identifier in repr" + + def test_repr_multiline_format(self): + """Test NormalizedFlow __repr__ uses multiline format.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + result = repr(nf) + # Should be multiline (contains newlines) + assert "\n" in result, "Expected multiline repr format" + assert result.count("\n") >= 2, "Expected at least 2 newlines in repr" + + def test_repr_original_and_current_different(self): + """Test NormalizedFlow __repr__ when original and current differ.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + original = Flow.from_dict(data) + normalized = original.normalize() + nf = NormalizedFlow( + original=original, normalized=normalized, current=copy(normalized) + ) + + # Modify current significantly + nf.update_current(name="Water", unit="g", location="US") + + result = repr(nf) + # Both should be represented + assert "original=" in result, "Expected 'original=' in repr" + assert "current=" in result, "Expected 'current=' in repr" + # Original name should be present + assert ( + "Carbon dioxide" in result or "carbon dioxide" in result + ), "Expected original name in repr" + # Modified name should be present + assert "Water" in result or "water" in result, "Expected modified name in repr" + + +class TestNormalizedFlowFromDict: + """Test NormalizedFlow from_dict static method.""" + + def test_from_dict_creates_normalized_flow(self): + """Test from_dict creates NormalizedFlow from dictionary.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + nf = NormalizedFlow.from_dict(data) + + assert isinstance(nf, NormalizedFlow), "Expected NormalizedFlow instance" + assert nf.original.name.data == "Carbon dioxide", "Expected original name" + assert nf.normalized.name.data == "carbon dioxide", "Expected normalized name" + assert nf.current.name.data == "carbon dioxide", "Expected current name" + + def test_from_dict_sets_original_correctly(self): + """Test from_dict sets original flow correctly.""" + data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + "location": "US", + } + nf = NormalizedFlow.from_dict(data) + + assert ( + nf.original.name.data == "Carbon dioxide, NL" + ), "Expected original name preserved" + assert nf.original.location == "US", "Expected original location preserved" + + def test_from_dict_sets_normalized_correctly(self): + """Test from_dict sets normalized flow correctly.""" + data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + nf = NormalizedFlow.from_dict(data) + + # Normalized should extract location from name + assert ( + nf.normalized.location == "NL" + ), "Expected normalized location extracted from name" + assert ( + 
nf.normalized.name.data == "carbon dioxide" + ), "Expected normalized name without location" + + def test_from_dict_sets_current_as_copy_of_normalized(self): + """Test from_dict sets current as copy of normalized.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + nf = NormalizedFlow.from_dict(data) + + assert ( + nf.current.name.data == nf.normalized.name.data + ), "Expected current equals normalized" + assert ( + nf.current is not nf.normalized + ), "Expected current is a copy, not same object" + + +class TestNormalizedFlowUnitCompatible: + """Test NormalizedFlow unit_compatible method.""" + + def test_unit_compatible_same_units(self): + """Test unit_compatible with same units.""" + data1 = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + data2 = {"name": "Methane", "context": "air", "unit": "kg"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + assert nf1.unit_compatible(nf2) is True, "Expected same units to be compatible" + + def test_unit_compatible_different_compatible_units(self): + """Test unit_compatible with different but compatible units.""" + data1 = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + data2 = {"name": "Methane", "context": "air", "unit": "g"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + assert nf1.unit_compatible(nf2) is True, "Expected kg and g to be compatible" + + def test_unit_compatible_incompatible_units(self): + """Test unit_compatible with incompatible units.""" + data1 = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + data2 = {"name": "Water", "context": "water", "unit": "m3"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + assert ( + nf1.unit_compatible(nf2) is False + ), "Expected kg and m3 to be incompatible" + + +class TestNormalizedFlowConversionFactor: + """Test NormalizedFlow conversion_factor method.""" + + def test_conversion_factor_same_units(self): + """Test conversion_factor for same units (should be 1.0).""" + data1 = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + data2 = {"name": "Methane", "context": "air", "unit": "kg"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + result = nf1.conversion_factor(nf2) + assert result == 1.0, f"Expected conversion_factor to be 1.0, but got {result}" + + def test_conversion_factor_compatible_units(self): + """Test conversion_factor for compatible units.""" + data1 = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + data2 = {"name": "Methane", "context": "air", "unit": "g"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + result = nf1.conversion_factor(nf2) + assert ( + result == 1000.0 + ), f"Expected conversion_factor to be 1000.0 (kg to g), but got {result}" + + def test_conversion_factor_reverse_direction(self): + """Test conversion_factor in reverse direction.""" + data1 = {"name": "Carbon dioxide", "context": "air", "unit": "g"} + data2 = {"name": "Methane", "context": "air", "unit": "kg"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + result = nf1.conversion_factor(nf2) + assert ( + result == 0.001 + ), f"Expected conversion_factor to be 0.001 (g to kg), but got {result}" + + def test_conversion_factor_incompatible_units(self): + """Test conversion_factor with incompatible units returns NaN.""" + import math + + data1 = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + 
data2 = {"name": "Water", "context": "water", "unit": "m3"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + result = nf1.conversion_factor(nf2) + assert math.isnan( + result + ), f"Expected conversion_factor to be NaN for incompatible units, but got {result}" + + def test_conversion_factor_with_transformation_factor(self): + """Test conversion_factor multiplies transformation factor by unit conversion.""" + data1 = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "conversion_factor": 2.5, + } + data2 = {"name": "Methane", "context": "air", "unit": "g"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + result = nf1.conversion_factor(nf2) + # transformation_factor (2.5) * unit_conversion (1000.0 kg to g) = 2500.0 + assert ( + result == 2500.0 + ), f"Expected conversion_factor to be 2500.0 (2.5 * 1000.0), but got {result}" + + def test_conversion_factor_with_transformation_factor_reverse(self): + """Test conversion_factor with transformation factor in reverse direction.""" + data1 = { + "name": "Carbon dioxide", + "context": "air", + "unit": "g", + "conversion_factor": 0.5, + } + data2 = {"name": "Methane", "context": "air", "unit": "kg"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + result = nf1.conversion_factor(nf2) + # transformation_factor (0.5) * unit_conversion (0.001 g to kg) = 0.0005 + assert ( + result == 0.0005 + ), f"Expected conversion_factor to be 0.0005 (0.5 * 0.001), but got {result}" + + def test_conversion_factor_with_transformation_factor_same_units(self): + """Test conversion_factor with transformation factor but same units.""" + data1 = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "conversion_factor": 3.0, + } + data2 = {"name": "Methane", "context": "air", "unit": "kg"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + result = nf1.conversion_factor(nf2) + # transformation_factor (3.0) * unit_conversion (1.0 same units) = 3.0 + assert ( + result == 3.0 + ), f"Expected conversion_factor to be 3.0 (3.0 * 1.0), but got {result}" + + def test_conversion_factor_with_none_transformation_factor(self): + """Test conversion_factor when transformation_factor is None (defaults to 1.0).""" + data1 = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + data2 = {"name": "Methane", "context": "air", "unit": "g"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + # Ensure conversion_factor is None + assert ( + nf1.current.conversion_factor is None + ), "Expected conversion_factor to be None" + + result = nf1.conversion_factor(nf2) + # None defaults to 1.0, so 1.0 * 1000.0 = 1000.0 + assert ( + result == 1000.0 + ), f"Expected conversion_factor to be 1000.0 (1.0 * 1000.0), but got {result}" + + def test_conversion_factor_with_transformation_factor_zero(self): + """Test conversion_factor with transformation_factor of 0.0. + + Note: Due to Python's 'or' operator behavior, 0.0 is treated as falsy + and defaults to 1.0, so the result is 1.0 * unit_conversion. 
+ """ + data1 = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "conversion_factor": 0.0, + } + data2 = {"name": "Methane", "context": "air", "unit": "g"} + + nf1 = NormalizedFlow.from_dict(data1) + nf2 = NormalizedFlow.from_dict(data2) + + result = nf1.conversion_factor(nf2) + # Due to 'or 1.0', 0.0 is treated as falsy and defaults to 1.0 + # So: 1.0 * unit_conversion (1000.0) = 1000.0 + assert ( + result == 1000.0 + ), f"Expected conversion_factor to be 1000.0 (1.0 * 1000.0 due to 'or' behavior), but got {result}" + + +class TestNormalizedFlowExport: + """Test NormalizedFlow export method.""" + + def test_export_exports_original_flow_data(self): + """Test export exports original flow data.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + nf = NormalizedFlow.from_dict(data) + result = nf.export() + + assert result["name"] == "Carbon dioxide", "Expected original name in export" + assert result["unit"] == "kg", "Expected original unit in export" + # Context.value returns the original value (string in this case) + assert result["context"] == "air", "Expected original context in export" + + def test_export_only_non_none_values(self): + """Test export only includes non-None values.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + nf = NormalizedFlow.from_dict(data) + result = nf.export() + + assert "identifier" not in result, "Expected identifier not in export when None" + assert "location" not in result, "Expected location not in export when None" + + def test_export_includes_location_when_present(self): + """Test export includes location when present.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "location": "NL", + } + nf = NormalizedFlow.from_dict(data) + result = nf.export() + + assert "location" in result, "Expected location in export when present" + assert result["location"] == "NL", "Expected location value in export" + + def test_export_includes_identifier_when_present(self): + """Test export includes identifier when present.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "test-id-123", + } + nf = NormalizedFlow.from_dict(data) + result = nf.export() + + assert "identifier" in result, "Expected identifier in export when present" + assert ( + result["identifier"] == "test-id-123" + ), "Expected identifier value in export" + + def test_export_cas_number_correctly(self): + """Test CAS number is exported correctly.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "cas_number": "000124-38-9", + } + nf = NormalizedFlow.from_dict(data) + result = nf.export() + + assert "cas_number" in result, "Expected cas_number in export when present" + # CAS number is exported from normalized flow + assert isinstance(result["cas_number"], str), "Expected cas_number to be string" + + +class TestNormalizedFlowProperties: + """Test NormalizedFlow property accessors.""" + + def test_properties_return_current_flow_values(self): + """Test properties return correct value from current flow.""" + data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "location": "NL", + "identifier": "test-id", + } + nf = NormalizedFlow.from_dict(data) + + assert nf.name == "carbon dioxide", "Expected name property from current" + # Unit is normalized, so "kg" becomes "kilogram" + assert nf.unit == "kilogram", "Expected unit property from current (normalized)" + assert nf.context == ("air",), "Expected context 
property from current" + assert nf.location == "NL", "Expected location property from current" + assert nf.identifier == "test-id", "Expected identifier property from current" + + def test_properties_reflect_update_current(self): + """Test properties reflect changes after update_current().""" + data = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + nf = NormalizedFlow.from_dict(data) + + original_name = nf.name + nf.update_current(name="Modified name", unit="g") + + # Name is not normalized when passed to update_current via Flow.from_dict + assert nf.name == "Modified name", "Expected name property to reflect update" + # Unit is not normalized when passed to update_current via Flow.from_dict + assert nf.unit == "g", "Expected unit property to reflect update" + assert nf.name != original_name, "Expected name to change after update" + + def test_properties_reflect_reset_current(self): + """Test properties reflect reset after reset_current().""" + data = {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + nf = NormalizedFlow.from_dict(data) + + normalized_name = nf.name + nf.update_current(name="Modified name") + assert nf.name != normalized_name, "Expected name to change after update" + + nf.reset_current() + assert nf.name == normalized_name, "Expected name to reset after reset_current" diff --git a/tests/unit/matching/test_equivalent_names.py b/tests/unit/matching/test_equivalent_names.py new file mode 100644 index 0000000..213531d --- /dev/null +++ b/tests/unit/matching/test_equivalent_names.py @@ -0,0 +1,121 @@ +"""Unit tests for equivalent_names function.""" + +import pytest + +from flowmapper.matching.specialized import equivalent_names + + +class TestEquivalentNames: + """Test equivalent_names function.""" + + def test_equivalent_with_in_ground_suffix(self): + """Test that names with ', in ground' suffix are equivalent.""" + assert equivalent_names("Carbon dioxide, in ground", "Carbon dioxide") is True + assert equivalent_names("Carbon dioxide", "Carbon dioxide, in ground") is True + + def test_equivalent_with_ion_suffix(self): + """Test that names with ', ion' suffix are equivalent.""" + assert equivalent_names("Carbon dioxide, ion", "Carbon dioxide") is True + assert equivalent_names("Carbon dioxide", "Carbon dioxide, ion") is True + + def test_equivalent_with_in_air_suffix(self): + """Test that names with ', in air' suffix are equivalent.""" + assert equivalent_names("Carbon dioxide, in air", "Carbon dioxide") is True + assert equivalent_names("Carbon dioxide", "Carbon dioxide, in air") is True + + def test_equivalent_with_in_water_suffix(self): + """Test that names with ', in water' suffix are equivalent.""" + assert equivalent_names("Carbon dioxide, in water", "Carbon dioxide") is True + assert equivalent_names("Carbon dioxide", "Carbon dioxide, in water") is True + + def test_equivalent_with_unspecified_origin_suffix(self): + """Test that names with ', unspecified origin' suffix are equivalent.""" + assert ( + equivalent_names("Carbon dioxide, unspecified origin", "Carbon dioxide") + is True + ) + assert ( + equivalent_names("Carbon dioxide", "Carbon dioxide, unspecified origin") + is True + ) + + def test_not_equivalent_different_suffixes(self): + """Test that names with different suffixes are not equivalent.""" + assert ( + equivalent_names("Carbon dioxide, in ground", "Carbon dioxide, in air") + is False + ) + assert ( + equivalent_names("Carbon dioxide, in air", "Carbon dioxide, in water") + is False + ) + + def 
test_equivalent_biogenic_and_non_fossil(self): + """Test that biogenic and non-fossil names are equivalent.""" + assert equivalent_names("Methane, biogenic", "Methane, non-fossil") is True + assert equivalent_names("Methane, non-fossil", "Methane, biogenic") is True + + def test_biogenic_non_fossil_with_matching_base(self): + """Test biogenic/non-fossil equivalence with matching base names.""" + assert ( + equivalent_names("Carbon dioxide, biogenic", "Carbon dioxide, non-fossil") + is True + ) + assert equivalent_names("Water, biogenic", "Water, non-fossil") is True + + def test_biogenic_non_fossil_with_different_base(self): + """Test that biogenic/non-fossil with different base names are not equivalent.""" + assert equivalent_names("Methane, biogenic", "Ethane, non-fossil") is False + + def test_not_equivalent_different_base_names(self): + """Test that names with different base names are not equivalent.""" + assert equivalent_names("Carbon dioxide", "Carbon monoxide") is False + assert equivalent_names("Methane", "Ethane") is False + + def test_not_equivalent_same_suffix_both_sides(self): + """Test that names with same suffix on both sides are not equivalent.""" + # Both have the same suffix, so they're not equivalent (base names differ) + assert equivalent_names("Carbon dioxide, in air", "Methane, in air") is False + + def test_case_sensitive_base_name(self): + """Test that base name comparison is case-sensitive.""" + assert equivalent_names("Carbon dioxide, in air", "carbon dioxide") is False + assert equivalent_names("carbon dioxide, in air", "Carbon dioxide") is False + + def test_empty_strings(self): + """Test that empty strings are not equivalent.""" + assert equivalent_names("", "") is False + assert equivalent_names("Carbon dioxide", "") is False + assert equivalent_names("", "Carbon dioxide") is False + + def test_suffix_only(self): + """Test that suffix-only strings are handled correctly.""" + # When one string is just the suffix and the other is empty, + # removing the suffix from the first gives an empty string, + # which matches the second empty string, so they're equivalent + assert equivalent_names(", in air", "") is True + assert equivalent_names("", ", in air") is True + + def test_multiple_suffixes_not_supported(self): + """Test that names with multiple supported suffixes are not equivalent.""" + # Note: This tests the current behavior - names with multiple suffixes + # are not handled by the function + assert ( + equivalent_names("Carbon dioxide, in air, ion", "Carbon dioxide") is False + ) + + def test_biogenic_with_other_suffix(self): + """Test that biogenic with other suffix is not equivalent to base.""" + # "Carbon dioxide, biogenic" should not match "Carbon dioxide, in air" + # because biogenic is only equivalent to non-fossil + assert ( + equivalent_names("Carbon dioxide, biogenic", "Carbon dioxide, in air") + is False + ) + + def test_non_fossil_with_other_suffix(self): + """Test that non-fossil with other suffix is not equivalent to base.""" + assert ( + equivalent_names("Carbon dioxide, non-fossil", "Carbon dioxide, in air") + is False + ) diff --git a/tests/unit/matching/test_match_identical_names_target_uuid_identifier.py b/tests/unit/matching/test_match_identical_names_target_uuid_identifier.py new file mode 100644 index 0000000..51a79a2 --- /dev/null +++ b/tests/unit/matching/test_match_identical_names_target_uuid_identifier.py @@ -0,0 +1,624 @@ +"""Unit tests for match_identical_names_target_uuid_identifier function.""" + +from copy import copy + +import 
pytest + +from flowmapper.domain.flow import Flow +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.matching.basic import match_identical_names_target_uuid_identifier + + +class TestMatchIdenticalNamesTargetUuidIdentifier: + """Test match_identical_names_target_uuid_identifier function.""" + + def test_basic_matching_with_uuid_identifier(self): + """Test basic matching when target has valid UUID identifier.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", # Valid UUID + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) + + assert len(matches) == 1, "Expected one match" + assert matches[0].source == source_flow, "Expected source to match" + assert matches[0].target == target_flow, "Expected target to match" + assert ( + matches[0].condition == MatchCondition.exact + ), "Expected condition to be exact" + assert ( + matches[0].function_name == "match_identical_names_target_uuid_identifier" + ), "Expected correct function name" + + def test_no_match_when_target_has_no_identifier(self): + """Test that no match occurs when target has no identifier.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + # No identifier + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) + + assert len(matches) == 0, "Expected no match when target has no identifier" + + def test_no_match_when_target_identifier_not_uuid(self): + """Test that no match occurs when target identifier is not a UUID.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "not-a-uuid", # Not a valid UUID + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) 
+ + assert ( + len(matches) == 0 + ), "Expected no match when target identifier is not a UUID" + + def test_no_match_when_names_differ(self): + """Test that no match occurs when names differ.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Methane", # Different name + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) + + assert len(matches) == 0, "Expected no match when names differ" + + def test_no_match_when_contexts_differ(self): + """Test that no match occurs when contexts differ.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Carbon dioxide", + "context": "water", # Different context + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) + + assert len(matches) == 0, "Expected no match when contexts differ" + + def test_no_match_when_locations_differ(self): + """Test that no match occurs when locations differ.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Carbon dioxide, DE", # Different location + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) + + assert len(matches) == 0, "Expected no match when locations differ" + + def test_no_match_when_oxidation_states_differ(self): + """Test that no match occurs when oxidation states differ.""" + source_data = { + "name": "Iron(II) oxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Iron(III) oxide", # Different oxidation state + "context": "air", + "unit": "kg", + 
"identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) + + assert len(matches) == 0, "Expected no match when oxidation states differ" + + def test_matches_with_custom_function_name(self): + """Test that custom function_name parameter is used.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], + target_flows=[target_nf], + function_name="custom_function", + ) + + assert len(matches) == 1, "Expected one match" + assert ( + matches[0].function_name == "custom_function" + ), "Expected custom function name" + + def test_matches_with_custom_comment(self): + """Test that custom comment parameter is used.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], + target_flows=[target_nf], + comment="Custom comment", + ) + + assert len(matches) == 1, "Expected one match" + assert matches[0].comment == "Custom comment", "Expected custom comment" + + def test_matches_with_custom_match_condition(self): + """Test that custom match_condition parameter is used.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], + target_flows=[target_nf], + match_condition=MatchCondition.related, + ) + + assert len(matches) == 1, "Expected 
one match" + assert ( + matches[0].condition == MatchCondition.related + ), "Expected custom match condition" + + def test_multiple_source_flows_same_group(self): + """Test matching multiple source flows in the same group.""" + source_flows = [] + for i in range(3): + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + source_flows.append(source_nf) + + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=source_flows, target_flows=[target_nf] + ) + + assert len(matches) == 3, "Expected three matches for three source flows" + + def test_filters_targets_without_uuid(self): + """Test that only targets with UUID identifiers are considered.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # Target with UUID - should match + target1_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target1_flow = Flow.from_dict(target1_data) + target1_normalized = target1_flow.normalize() + target1_nf = NormalizedFlow( + original=target1_flow, + normalized=target1_normalized, + current=copy(target1_normalized), + ) + + # Target without identifier - should not match + target2_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target2_flow = Flow.from_dict(target2_data) + target2_normalized = target2_flow.normalize() + target2_nf = NormalizedFlow( + original=target2_flow, + normalized=target2_normalized, + current=copy(target2_normalized), + ) + + # Target with non-UUID identifier - should not match + target3_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "not-a-uuid", + } + target3_flow = Flow.from_dict(target3_data) + target3_normalized = target3_flow.normalize() + target3_nf = NormalizedFlow( + original=target3_flow, + normalized=target3_normalized, + current=copy(target3_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], + target_flows=[target1_nf, target2_nf, target3_nf], + ) + + assert len(matches) == 1, "Expected one match (only target with UUID)" + assert matches[0].target == target1_flow, "Expected match with UUID target" + + def test_uuid_format_validation(self): + """Test that UUID format is strictly validated.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # Invalid UUID formats that should not match + invalid_identifiers = [ + "550e8400-e29b-41d4-a716", # Too short + 
"550e8400-e29b-41d4-a716-446655440000-extra", # Too long + "550e8400e29b41d4a716446655440000", # Missing hyphens + "550e8400-e29b-41d4-a716-44665544000g", # Invalid character + "550E8400-E29B-41D4-A716-446655440000", # Uppercase (should work but let's test) + ] + + for invalid_id in invalid_identifiers: + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": invalid_id, + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) + + # Note: Uppercase UUIDs should actually match (regex allows A-F) + if invalid_id == "550E8400-E29B-41D4-A716-446655440000": + assert ( + len(matches) == 1 + ), f"Expected match for uppercase UUID: {invalid_id}" + else: + assert ( + len(matches) == 0 + ), f"Expected no match for invalid UUID format: {invalid_id}" + + def test_unit_compatibility_required(self): + """Test that only unit-compatible flows are matched.""" + source_data = { + "name": "Water", + "context": "water", + "unit": "m3", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_data = { + "name": "Water", + "context": "water", + "unit": "kg", # Incompatible unit + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[target_nf] + ) + + # get_matches filters by unit compatibility + assert len(matches) == 0, "Expected no match for incompatible units" + + def test_empty_source_flows(self): + """Test with empty source flows list.""" + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "identifier": "550e8400-e29b-41d4-a716-446655440000", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[], target_flows=[target_nf] + ) + + assert len(matches) == 0, "Expected no matches with empty source flows" + + def test_empty_target_flows(self): + """Test with empty target flows list.""" + source_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + matches = match_identical_names_target_uuid_identifier( + source_flows=[source_nf], target_flows=[] + ) + + assert len(matches) == 0, "Expected no matches with empty target flows" diff --git a/tests/unit/matching/test_match_names_with_suffix_removal.py b/tests/unit/matching/test_match_names_with_suffix_removal.py new file mode 100644 index 0000000..e6dd9a1 --- /dev/null +++ b/tests/unit/matching/test_match_names_with_suffix_removal.py @@ -0,0 +1,413 @@ +"""Unit 
tests for match_names_with_suffix_removal function.""" + +from copy import copy + +import pytest + +from flowmapper.domain.flow import Flow +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.matching.specialized import match_names_with_suffix_removal + + +class TestMatchNamesWithSuffixRemoval: + """Test match_names_with_suffix_removal function.""" + + def test_matches_with_in_air_suffix(self): + """Test matching flows where one has ', in air' suffix.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + assert matches[0].source == source.original + assert matches[0].target == target.original + assert matches[0].condition == MatchCondition.close + assert matches[0].function_name == "match_names_with_suffix_removal" + + def test_matches_with_in_ground_suffix(self): + """Test matching flows where one has ', in ground' suffix.""" + source = NormalizedFlow.from_dict( + {"name": "Methane, in ground", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Methane", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + def test_matches_with_ion_suffix(self): + """Test matching flows where one has ', ion' suffix.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, ion", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + def test_matches_biogenic_to_non_fossil(self): + """Test matching biogenic to non-fossil flows.""" + source = NormalizedFlow.from_dict( + {"name": "Methane, biogenic", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Methane, non-fossil", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + def test_requires_matching_context(self): + """Test that flows must have matching context.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + { + "name": "Carbon dioxide", + "context": "water", # Different context + "unit": "kg", + } + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 0 + + def test_requires_matching_oxidation_state(self): + """Test that flows must have matching oxidation state.""" + # Create flows with different oxidation states by using names that + # will be parsed differently + source = NormalizedFlow.from_dict( + {"name": "Iron(II), in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + { + "name": "Iron(III)", # Different oxidation state (III vs II) + "context": "air", + "unit": "kg", + } + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 0 + + def test_requires_matching_location(self): + """Test that 
flows must have matching location.""" + source = NormalizedFlow.from_dict( + { + "name": "Carbon dioxide, in air", + "context": "air", + "unit": "kg", + "location": "NL", + } + ) + target = NormalizedFlow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "location": "DE", # Different location + } + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 0 + + def test_matches_with_matching_location(self): + """Test that flows with matching location are matched.""" + source = NormalizedFlow.from_dict( + { + "name": "Carbon dioxide, in air", + "context": "air", + "unit": "kg", + "location": "NL", + } + ) + target = NormalizedFlow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "location": "NL", # Same location + } + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + def test_matches_with_none_location(self): + """Test that flows with None location match.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + def test_requires_unit_compatibility(self): + """Test that flows must be unit-compatible.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "m3", # Incompatible unit + } + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 0 + + def test_matches_multiple_sources_same_group(self): + """Test matching multiple source flows in the same group.""" + source1 = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + source2 = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source1, source2], target_flows=[target] + ) + + assert len(matches) == 2 + + def test_matches_multiple_targets(self): + """Test matching when multiple target flows match.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target1 = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target2 = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target1, target2] + ) + + # get_matches only creates matches when exactly one target remains + # after filtering. 
If multiple targets match and have the same context, + # no match is created (to avoid ambiguity) + # In this case, both targets have the same context, so no match is created + assert len(matches) == 0 + + def test_case_insensitive_name_matching(self): + """Test that name matching is case-insensitive.""" + source = NormalizedFlow.from_dict( + { + "name": "Carbon Dioxide, in air", # Mixed case + "context": "air", + "unit": "kg", + } + ) + target = NormalizedFlow.from_dict( + {"name": "carbon dioxide", "context": "air", "unit": "kg"} # Lowercase + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + def test_custom_function_name(self): + """Test that custom function_name is used.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], + target_flows=[target], + function_name="custom_match_function", + ) + + assert matches[0].function_name == "custom_match_function" + + def test_custom_comment(self): + """Test that custom comment is used.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target], comment="Custom comment" + ) + + assert matches[0].comment == "Custom comment" + + def test_custom_match_condition(self): + """Test that custom match_condition is used.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], + target_flows=[target], + match_condition=MatchCondition.exact, + ) + + assert matches[0].condition == MatchCondition.exact + + def test_no_match_when_names_not_equivalent(self): + """Test that flows with non-equivalent names don't match.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + { + "name": "Carbon monoxide", # Different base name + "context": "air", + "unit": "kg", + } + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 0 + + def test_matches_with_unspecified_origin_suffix(self): + """Test matching flows with ', unspecified origin' suffix.""" + source = NormalizedFlow.from_dict( + {"name": "Methane, unspecified origin", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Methane", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + def test_matches_with_in_water_suffix(self): + """Test matching flows with ', in water' suffix.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in water", "context": "water", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "water", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + 
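+    # Illustrative sketch (hypothetical test name, not from the original suite):
+    # the ambiguity rule noted in test_matches_multiple_targets above only applies
+    # while several candidate targets survive filtering. When all but one target
+    # is filtered out (here, by context), the survivor is matched as usual.
+    def test_single_surviving_candidate_is_matched(self):
+        """Sketch: a lone surviving candidate after filtering is matched."""
+        source = NormalizedFlow.from_dict(
+            {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"}
+        )
+        target_air = NormalizedFlow.from_dict(
+            {"name": "Carbon dioxide", "context": "air", "unit": "kg"}
+        )
+        target_water = NormalizedFlow.from_dict(
+            {"name": "Carbon dioxide", "context": "water", "unit": "kg"}
+        )
+
+        matches = match_names_with_suffix_removal(
+            source_flows=[source], target_flows=[target_air, target_water]
+        )
+
+        assert len(matches) == 1
+        assert matches[0].target == target_air.original
+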
def test_matches_reverse_direction(self): + """Test matching when target has suffix and source doesn't.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + + def test_matches_multiple_different_groups(self): + """Test matching multiple groups of flows.""" + source1 = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + source2 = NormalizedFlow.from_dict( + {"name": "Methane, in ground", "context": "air", "unit": "kg"} + ) + target1 = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target2 = NormalizedFlow.from_dict( + {"name": "Methane", "context": "air", "unit": "kg"} + ) + + matches = match_names_with_suffix_removal( + source_flows=[source1, source2], target_flows=[target1, target2] + ) + + assert len(matches) == 2 + + def test_marked_as_matched(self): + """Test that matched source flows are marked as matched.""" + source = NormalizedFlow.from_dict( + {"name": "Carbon dioxide, in air", "context": "air", "unit": "kg"} + ) + target = NormalizedFlow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + assert source.matched is False + + matches = match_names_with_suffix_removal( + source_flows=[source], target_flows=[target] + ) + + assert len(matches) == 1 + assert source.matched is True diff --git a/tests/unit/test_add_missing_regionalized_flows.py b/tests/unit/test_add_missing_regionalized_flows.py new file mode 100644 index 0000000..36ee3e9 --- /dev/null +++ b/tests/unit/test_add_missing_regionalized_flows.py @@ -0,0 +1,698 @@ +"""Unit tests for add_missing_regionalized_flows function.""" + +from copy import copy + +from flowmapper.domain.flow import Flow +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.matching import add_missing_regionalized_flows + + +class TestAddMissingRegionalizedFlows: + """Test add_missing_regionalized_flows function.""" + + def test_basic_functionality_with_enough_regions(self): + """Test basic functionality when there are enough regions in target.""" + # Source flow with location + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + assert source_nf.location == "NL" + assert source_nf.name == "carbon dioxide" + + # Target flows with different locations (other_regions) + target_flows = [] + for location in ["DE", "FR", "US", "CA"]: + target_data = { + "name": f"Carbon dioxide, {location}", + "context": "air", + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + assert target_nf.name == "carbon dioxide" + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + assert len(matches) == 1, "Expected one match" + assert matches[0].new_target_flow is True, 
"Expected new_target_flow to be True" + assert ( + matches[0].function_name == "add_missing_regionalized_flows" + ), "Expected correct function name" + assert ( + matches[0].condition == MatchCondition.related + ), "Expected condition to be related" + assert matches[0].source == source_flow, "Expected source to match" + # Target should have the source's location in the name + assert matches[0].target.name == "Carbon dioxide, NL" + # Note: location attribute is not set by copy_with_new_location, only name is updated + + def test_with_other_regions_exists(self): + """Test that matches are created when other regionalized flows exist.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # 2 target flows with different locations (other_regions) + target_flows = [] + for location in ["DE", "FR"]: + target_data = { + "name": f"Carbon dioxide, {location}", + "context": "air", + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + assert len(matches) == 1, "Expected one match when other_regions exist" + assert ( + matches[0].target.name == "Carbon dioxide, NL" + ), "Expected target name to have source location" + + def test_with_single_other_region(self): + """Test with single other regionalized flow.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # 1 target flow with different location + target_data = { + "name": "Carbon dioxide, DE", + "context": "air", + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=[target_nf] + ) + + assert len(matches) == 1, "Expected one match with single other region" + assert ( + matches[0].target.name == "Carbon dioxide, NL" + ), "Expected target name to have source location" + + def test_unit_compatibility_filtering(self): + """Test that only unit-compatible flows are matched.""" + source_data = { + "name": "Water, NL", + "context": "water", + "unit": "m3", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # Target flows with incompatible unit + target_flows = [] + for location in ["DE", "FR", "US"]: + target_data = { + "name": f"Water, {location}", + "context": "water", + "unit": "kg", # Different unit + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + 
current=copy(target_normalized), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + # m3 and kg should not be unit-compatible, so no matches are expected here; + # the looser isinstance check tolerates either outcome if conversion is supported + assert isinstance(matches, list), "Expected list of matches" + + def test_multiple_sources_same_group(self): + """Test with multiple source flows in the same group.""" + source_flows = [] + for _ in range(3): + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + source_flows.append(source_nf) + + # Target flows with different locations + target_flows = [] + for location in ["DE", "FR", "US", "CA"]: + target_data = { + "name": f"Carbon dioxide, {location}", + "context": "air", + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=source_flows, target_flows=target_flows + ) + + # Should create a match for each source flow + assert len(matches) == 3, "Expected three matches for three source flows" + + def test_filters_out_flows_without_location(self): + """Test that source flows without location are filtered out.""" + # Source flow with location + source_with_location = Flow.from_dict( + {"name": "Carbon dioxide, NL", "context": "air", "unit": "kg"} + ) + source_nf_with = NormalizedFlow( + original=source_with_location, + normalized=source_with_location.normalize(), + current=copy(source_with_location.normalize()), + ) + + # Source flow without location + source_without_location = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + source_nf_without = NormalizedFlow( + original=source_without_location, + normalized=source_without_location.normalize(), + current=copy(source_without_location.normalize()), + ) + + # Target flows + target_flows = [] + for location in ["DE", "FR", "US"]: + target_flow = Flow.from_dict( + {"name": f"Carbon dioxide, {location}", "context": "air", "unit": "kg"} + ) + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_flow.normalize(), + current=copy(target_flow.normalize()), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf_with, source_nf_without], + target_flows=target_flows, + ) + + # Should only match the flow with location + assert len(matches) == 1, "Expected one match (only for flow with location)" + assert ( + matches[0].source == source_with_location + ), "Expected match to be for flow with location" + + def test_different_oxidation_states_not_matched(self): + """Test that flows with different oxidation states are not matched.""" + # Source flow with oxidation state + source_data = { + "name": "Iron(II) oxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # Target flows with a different oxidation state + target_flows = [] + for location in 
["DE", "FR", "US"]: + target_data = { + "name": "Iron(III) oxide, " + location, # Different oxidation state + "context": "air", + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + # Should not match if oxidation states differ + assert len(matches) == 0, "Expected no matches with different oxidation states" + + def test_different_contexts_not_matched(self): + """Test that flows with different contexts are not matched.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # Target flows with different context + target_flows = [] + for location in ["DE", "FR", "US"]: + target_data = { + "name": f"Carbon dioxide, {location}", + "context": "water", # Different context + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + assert len(matches) == 0, "Expected no matches with different contexts" + + def test_different_names_not_matched(self): + """Test that flows with different names are not matched.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # Target flows with different name + target_flows = [] + for location in ["DE", "FR", "US"]: + target_data = { + "name": f"Water, {location}", # Different name + "context": "air", + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + assert len(matches) == 0, "Expected no matches with different names" + + def test_empty_source_flows(self): + """Test with empty source flows list.""" + target_flows = [] + for location in ["DE", "FR", "US"]: + target_flow = Flow.from_dict( + {"name": f"Carbon dioxide, {location}", "context": "air", "unit": "kg"} + ) + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_flow.normalize(), + current=copy(target_flow.normalize()), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[], target_flows=target_flows + ) + + assert len(matches) == 0, "Expected no matches with empty source flows" + + def test_empty_target_flows(self): + """Test with empty target flows list.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + 
source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=[] + ) + + assert len(matches) == 0, "Expected no matches with empty target flows" + + def test_conversion_factor_calculated(self): + """Test that conversion factor is calculated correctly.""" + source_data = { + "name": "Water, NL", + "context": "water", + "unit": "m3", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # Target flows with compatible unit + target_flows = [] + for location in ["DE", "FR", "US"]: + target_data = { + "name": f"Water, {location}", + "context": "water", + "unit": "m3", # Same unit + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + if len(matches) > 0: + assert ( + matches[0].conversion_factor == 1.0 + ), "Expected conversion_factor to be calculated (1.0 for same unit)" + + def test_comment_includes_location(self): + """Test that comment includes the location information.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_flows = [] + for location in ["DE", "FR", "US", "CA"]: + target_flow = Flow.from_dict( + {"name": f"Carbon dioxide, {location}", "context": "air", "unit": "kg"} + ) + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_flow.normalize(), + current=copy(target_flow.normalize()), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + if len(matches) > 0: + assert ( + "location" in matches[0].comment.lower() + ), "Expected comment to mention location" + assert ( + "new target flow" in matches[0].comment.lower() + or "added" in matches[0].comment.lower() + ), "Expected comment to mention new target flow" + + def test_multiple_groups_processed(self): + """Test that multiple groups of source flows are processed.""" + source_flows = [] + # Group 1: Carbon dioxide, NL + source1 = Flow.from_dict( + {"name": "Carbon dioxide, NL", "context": "air", "unit": "kg"} + ) + source_nf1 = NormalizedFlow( + original=source1, + normalized=source1.normalize(), + current=copy(source1.normalize()), + ) + source_flows.append(source_nf1) + + # Group 2: Water, FR + source2 = Flow.from_dict( + {"name": "Water, FR", "context": "water", "unit": "kg"} + ) + source_nf2 = NormalizedFlow( + original=source2, + normalized=source2.normalize(), + current=copy(source2.normalize()), + ) + source_flows.append(source_nf2) + + # Target flows for both groups + target_flows = [] + # For carbon dioxide + for location in ["DE", "US", "CA"]: + target_flow = Flow.from_dict( + {"name": f"Carbon dioxide, {location}", "context": "air", "unit": "kg"} + ) + target_nf = NormalizedFlow( + 
original=target_flow, + normalized=target_flow.normalize(), + current=copy(target_flow.normalize()), + ) + target_flows.append(target_nf) + + # For water + for location in ["DE", "US", "CA"]: + target_flow = Flow.from_dict( + {"name": f"Water, {location}", "context": "water", "unit": "kg"} + ) + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_flow.normalize(), + current=copy(target_flow.normalize()), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=source_flows, target_flows=target_flows + ) + + # Should create matches for both groups + assert len(matches) >= 2, "Expected matches for both groups" + + def test_target_without_location_not_considered(self): + """Test that target flows without location are not considered as other_regions.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + target_flows = [] + # One target with location + target1 = Flow.from_dict( + {"name": "Carbon dioxide, DE", "context": "air", "unit": "kg"} + ) + target_nf1 = NormalizedFlow( + original=target1, + normalized=target1.normalize(), + current=copy(target1.normalize()), + ) + target_flows.append(target_nf1) + + # One target without location (should not be counted) + target2 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + target_nf2 = NormalizedFlow( + original=target2, + normalized=target2.normalize(), + current=copy(target2.normalize()), + ) + target_flows.append(target_nf2) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + # Should have matches because other_regions exists (even if only 1) + assert ( + len(matches) == 1 + ), "Expected one match when other_regions exists (even if only 1)" + + def test_with_non_regionalized_target(self): + """Test that the non-regionalized target is used when exactly one exists and there are no other_regions.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # One non-regionalized target (no location) + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=[target_nf] + ) + + # Should match because exactly one non_regionalized exists + assert ( + len(matches) == 1 + ), "Expected one match when exactly one non_regionalized exists" + assert ( + matches[0].target.name == "Carbon dioxide, NL" + ), "Expected target name to have source location" + + def test_with_multiple_non_regionalized_targets(self): + """Test that no match is created when multiple non-regionalized targets exist.""" + source_data = { + "name": "Carbon dioxide, NL", + "context": "air", + "unit": "kg", + } + source_flow = Flow.from_dict(source_data) + source_normalized = source_flow.normalize() + source_nf = NormalizedFlow( + 
original=source_flow, + normalized=source_normalized, + current=copy(source_normalized), + ) + + # Two non-regionalized targets (should not match) + target_flows = [] + for _ in range(2): + target_data = { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + } + target_flow = Flow.from_dict(target_data) + target_normalized = target_flow.normalize() + target_nf = NormalizedFlow( + original=target_flow, + normalized=target_normalized, + current=copy(target_normalized), + ) + target_flows.append(target_nf) + + matches = add_missing_regionalized_flows( + source_flows=[source_nf], target_flows=target_flows + ) + + # Should not match because more than one non_regionalized exists + assert ( + len(matches) == 0 + ), "Expected no match when multiple non_regionalized exist" diff --git a/tests/unit/test_cas.py b/tests/unit/test_cas.py new file mode 100644 index 0000000..8d33088 --- /dev/null +++ b/tests/unit/test_cas.py @@ -0,0 +1,410 @@ +"""Unit tests for CASField class.""" + +import pytest + +from flowmapper.fields import CASField + + +class TestCASFieldInitialization: + """Test CASField initialization.""" + + def test_init_with_valid_cas_string(self): + """Test initialization with valid CAS string.""" + cas = CASField("0000096-49-1") + assert ( + cas.data == "0000096-49-1" + ), f"Expected cas.data to be '0000096-49-1', but got {cas.data!r}" + from collections import UserString + + assert isinstance( + cas, UserString + ), f"Expected cas to be an instance of UserString, but got {type(cas)}" + + def test_init_with_empty_string_raises_error(self): + """Test initialization with empty string raises ValueError.""" + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + CASField("") + + def test_init_with_none_raises_error(self): + """Test initialization with None raises TypeError.""" + with pytest.raises(TypeError, match="CASField takes only `str`, but got"): + CASField(None) # type: ignore[arg-type] + + def test_init_with_integer_raises_error(self): + """Test initialization with integer raises TypeError.""" + with pytest.raises(TypeError, match="CASField takes only `str`, but got"): + CASField(96491) # type: ignore[arg-type] + + def test_init_with_userstring(self): + """Test initialization with UserString works.""" + from collections import UserString + + us = UserString("7782-40-3") + # CASField converts UserString to string before regex search, so it works + cas = CASField(us) + assert ( + cas.data == "7782-40-3" + ), f"Expected cas.data to be '7782-40-3', but got {cas.data!r}" + assert isinstance( + cas, CASField + ), f"Expected cas to be an instance of CASField, but got {type(cas)}" + + def test_init_with_two_digits_in_front(self): + """Test initialization with two digits in the front section.""" + assert CASField( + "94-75-7" + ), "Initialization with two digits in front section failed" + + def test_init_with_whitespace(self): + """Test initialization with whitespace.""" + cas = CASField(" 7782-40-3 ") + assert ( + cas.data == " 7782-40-3 " + ), f"Expected cas.data to preserve whitespace, but got {cas.data!r}" + + def test_inherits_from_userstring(self): + """Test that CASField inherits from UserString.""" + cas = CASField("7782-40-3") + from collections import UserString + + assert isinstance( + cas, UserString + ), f"Expected cas to be an instance of UserString, but got {type(cas)}" + # UserString is not a subclass of str + assert not isinstance( + cas, str + ), f"Expected cas to not be an instance of str (UserString is not a subclass), but got {type(cas)}" + + def 
test_init_with_casfield(self): + """Test initialization with another CASField object.""" + cas1 = CASField("7782-40-3") + cas2 = CASField(cas1) + assert ( + cas2.data == "7782-40-3" + ), f"Expected cas2.data to be '7782-40-3', but got {cas2.data!r}" + assert ( + cas1 == cas2 + ), f"Expected cas1 to equal cas2, but they are not equal (cas1={cas1!r}, cas2={cas2!r})" + assert ( + cas1 is not cas2 + ), "Expected cas1 and cas2 to be different instances, but they are the same instance" + assert isinstance( + cas2, CASField + ), f"Expected cas2 to be an instance of CASField, but got {type(cas2)}" + + +class TestCASFieldDigits: + """Test CASField digits property.""" + + def test_digits_with_dashes(self): + """Test digits property with dashes.""" + cas = CASField("0000096-49-1") + assert cas.digits == [ + 0, + 0, + 0, + 0, + 0, + 9, + 6, + 4, + 9, + 1, + ], f"Expected cas.digits to be [0, 0, 0, 0, 0, 9, 6, 4, 9, 1], but got {cas.digits}" + + def test_digits_without_dashes_raises_error(self): + """Test digits property without dashes raises ValueError.""" + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + CASField("0000096491") + + def test_digits_with_empty_string_raises_error(self): + """Test digits property with empty string raises ValueError.""" + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + CASField("") + + +class TestCASFieldExport: + """Test CASField export method.""" + + def test_export_with_standard_format(self): + """Test export with standard CAS format.""" + cas = CASField("7782-40-3") + assert ( + cas.export() == "7782-40-3" + ), f"Expected cas.export() to be '7782-40-3', but got {cas.export()!r}" + + def test_export_without_dashes_raises_error(self): + """Test export without dashes raises ValueError.""" + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + CASField("7782403") + + def test_export_with_leading_zeros(self): + """Test export with leading zeros.""" + cas = CASField("0007782-40-3") + # Export keeps leading zeros in the first part + assert ( + cas.export() == "0007782-40-3" + ), f"Expected cas.export() to be '0007782-40-3', but got {cas.export()!r}" + + def test_export_with_empty_string_raises_error(self): + """Test export with empty string raises ValueError.""" + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + CASField("") + + def test_export_with_single_digit_raises_error(self): + """Test export with single digit raises ValueError.""" + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + CASField("1") + + +class TestCASFieldCheckDigitExpected: + """Test CASField check_digit_expected property.""" + + def test_check_digit_expected_valid_cas(self): + """Test check_digit_expected with CAS number.""" + cas = CASField("7732-18-5") + expected = cas.check_digit_expected + assert ( + expected == 5 + ), f"Expected check_digit_expected to be 5, but got {expected}" + + def test_check_digit_expected_invalid_cas(self): + """Test check_digit_expected with invalid CAS number.""" + cas = CASField("7782-40-2") + # Check digit is 2, but expected is 3 + expected = cas.check_digit_expected + assert ( + expected == 3 + ), f"Expected check_digit_expected to be 3, but got {expected}" + + +class TestCASFieldValid: + """Test CASField valid method.""" + + def test_valid_with_invalid_cas(self): + """Test valid with invalid CAS number.""" + cas = CASField("7782-40-2") + assert ( + not cas.valid() + ), f"Expected cas.valid() to be False, 
but got {cas.valid()}" + + def test_valid_with_leading_zeros(self): + """Test valid with leading zeros.""" + cas = CASField("0000096-49-1") + # Check digit calculation includes leading zeros + is_valid = cas.valid() + assert is_valid and isinstance( + is_valid, bool + ), f"Expected cas.valid() to return a bool, but got {type(is_valid)}" + + +class TestCASFieldFromString: + """Test CASField from_string method.""" + + def test_from_string_with_valid_cas(self): + """Test from_string with valid CAS number.""" + cas = CASField("7782-40-3") + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + cas.from_string("000009-49-1") + + def test_from_string_with_whitespace(self): + """Test from_string with whitespace.""" + cas = CASField("7782-40-3") + result = cas.from_string(" 7782-40-3 ") + # Testing actual behavior + assert result is None or isinstance( + result, CASField + ), f"Expected result to be None or CASField, but got {type(result)}" + + def test_from_string_with_invalid_cas(self): + """Test from_string with invalid CAS number.""" + cas = CASField("7782-40-3") + result = cas.from_string("7782-40-2") + # Invalid CAS should return None + assert ( + result is None + ), f"Expected from_string to return None for invalid CAS, but got {result}" + + def test_from_string_with_empty_string(self): + """Test from_string with empty string.""" + cas = CASField("7782-40-3") + # Empty string will fail validation in __init__ + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + cas.from_string("") + + def test_from_string_with_none(self): + """Test from_string with None.""" + cas = CASField("7782-40-3") + result = cas.from_string(None) + assert ( + result is None + ), f"Expected from_string to return None for None, but got {result}" + + def test_from_string_returns_new_instance(self): + """Test that from_string returns a new instance when valid.""" + cas = CASField("7782-40-3") + result = cas.from_string("7440-05-3") + if result is not None: + assert ( + result is not cas + ), "Expected from_string() to return a new instance, but it returned the same instance" + assert ( + cas.data == "7782-40-3" + ), f"Expected original cas.data to remain '7782-40-3', but got {cas.data!r}" + + +class TestCASFieldEquality: + """Test CASField equality comparison.""" + + def test_eq_with_same_casfield(self): + """Test equality with same CASField instance (exact data match).""" + cas1 = CASField("7440-05-3") + cas2 = CASField("7440-05-3") + # CASField to CASField comparison uses exact data comparison + assert ( + cas1 == cas2 + ), f"Expected cas1 to equal cas2, but they are not equal (cas1={cas1!r}, cas2={cas2!r})" + + def test_eq_with_different_casfield_data(self): + """Test equality with CASField having different data.""" + cas1 = CASField("7440-05-3") + cas2 = CASField("7782-40-3") + assert ( + cas1 != cas2 + ), f"Expected cas1 to not equal cas2, but they are equal (cas1={cas1!r}, cas2={cas2!r})" + + def test_eq_with_casfield_different_formatting(self): + """Test equality with CASField having same CAS but different formatting.""" + cas1 = CASField("7440-05-3") + cas2 = CASField("0007440-05-3") + # CASField to CASField uses exact data comparison, so formatting matters + assert ( + cas1 != cas2 + ), f"Expected cas1 to not equal cas2 (different formatting), but they are equal (cas1={cas1!r}, cas2={cas2!r})" + + def test_eq_with_string_exact_match(self): + """Test equality with string that exactly matches.""" + cas = CASField("7440-05-3") + assert ( + cas == 
"7440-05-3" + ), f"Expected cas to equal '7440-05-3', but they are not equal (cas={cas!r})" + + def test_eq_with_string_different_cas(self): + """Test equality with string containing different CAS number.""" + cas = CASField("7440-05-3") + assert ( + cas != "7782-40-3" + ), f"Expected cas to not equal '7782-40-3', but they are equal (cas={cas!r})" + + def test_eq_with_string_leading_zeros(self): + """Test equality with string containing leading zeros (should normalize).""" + cas = CASField("7440-05-3") + # String comparison uses from_string which normalizes (strips leading zeros) + assert ( + cas == "0007440-05-3" + ), f"Expected cas to equal '0007440-05-3' (normalized), but they are not equal (cas={cas!r})" + + def test_eq_with_string_whitespace(self): + """Test equality with string containing whitespace (should normalize).""" + cas = CASField("7440-05-3") + # String comparison uses from_string which normalizes (strips whitespace) + assert ( + cas == " 7440-05-3 " + ), f"Expected cas to equal ' 7440-05-3 ' (normalized), but they are not equal (cas={cas!r})" + + def test_eq_with_string_leading_zeros_and_whitespace(self): + """Test equality with string containing both leading zeros and whitespace.""" + cas = CASField("7440-05-3") + # String comparison normalizes both whitespace and leading zeros + assert ( + cas == " 0007440-05-3 " + ), f"Expected cas to equal ' 0007440-05-3 ' (normalized), but they are not equal (cas={cas!r})" + + def test_eq_with_string_invalid_cas(self): + """Test equality with string containing invalid CAS number.""" + cas = CASField("7440-05-3") + # Invalid CAS strings return None from from_string, so equality is False + assert ( + cas != "7440-05-2" + ), f"Expected cas to not equal '7440-05-2' (invalid check digit), but they are equal (cas={cas!r})" + + def test_eq_with_string_empty_string(self): + """Test equality with empty string raises ValueError.""" + cas = CASField("7440-05-3") + # Empty string is invalid CAS, so from_string raises ValueError when creating CASField + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + _ = cas == "" + + def test_eq_with_userstring(self): + """Test equality with UserString.""" + from collections import UserString + + cas = CASField("7440-05-3") + us = UserString("7440-05-3") + # UserString is handled like str in __eq__, so it should normalize + assert ( + cas == us + ), f"Expected cas to equal UserString('7440-05-3'), but they are not equal (cas={cas!r})" + + def test_eq_with_userstring_leading_zeros(self): + """Test equality with UserString containing leading zeros.""" + from collections import UserString + + cas = CASField("7440-05-3") + us = UserString("0007440-05-3") + # UserString should normalize like str + assert ( + cas == us + ), f"Expected cas to equal UserString('0007440-05-3') (normalized), but they are not equal (cas={cas!r})" + + def test_eq_with_other_types(self): + """Test equality with other types returns False.""" + cas = CASField("7440-05-3") + # Non-string, non-CASField types should return False + assert ( + cas != 744053 + ), f"Expected cas to not equal integer, but they are equal (cas={cas!r})" + assert ( + cas != None + ), f"Expected cas to not equal None, but they are equal (cas={cas!r})" + assert ( + cas != [] + ), f"Expected cas to not equal list, but they are equal (cas={cas!r})" + + def test_ne_with_different_casfield(self): + """Test inequality with different CASField.""" + cas1 = CASField("7440-05-3") + cas2 = CASField("7782-40-3") + assert ( + cas1 != cas2 + ), 
f"Expected cas1 to not equal cas2, but they are equal (cas1={cas1!r}, cas2={cas2!r})" + + def test_ne_with_string(self): + """Test inequality with different string.""" + cas = CASField("7440-05-3") + assert ( + cas != "7782-40-3" + ), f"Expected cas to not equal '7782-40-3', but they are equal (cas={cas!r})" + + +class TestCASFieldStringBehavior: + """Test CASField string behavior (inherited from UserString).""" + + def test_string_operations(self): + """Test that CASField behaves like a string.""" + cas = CASField("7782-40-3") + assert len(cas) == 9, f"Expected len(cas) to be 9, but got {len(cas)}" + assert ( + cas.upper() == "7782-40-3" + ), f"Expected cas.upper() to be '7782-40-3', but got {cas.upper()!r}" + assert cas.startswith( + "778" + ), f"Expected cas.startswith('778') to be True, but got {cas.startswith('778')}" + + def test_string_concatenation_raises_error(self): + """Test that CASField concatenation raises ValueError for invalid format.""" + cas1 = CASField("7782-40-3") + cas2 = CASField("7440-05-3") + # Concatenation creates a string that doesn't match CAS format, so __init__ raises ValueError + with pytest.raises(ValueError, match="Given input is not valid CAS formatting"): + _ = cas1 + " and " + cas2 diff --git a/tests/unit/test_context.py b/tests/unit/test_context.py new file mode 100644 index 0000000..7d71f92 --- /dev/null +++ b/tests/unit/test_context.py @@ -0,0 +1,644 @@ +"""Unit tests for ContextField class.""" + +import pytest + +from flowmapper.fields import ContextField +from flowmapper.utils import MISSING_VALUES + + +class TestContextFieldInitialization: + """Test ContextField initialization.""" + + def test_init_with_string(self): + """Test initialization with string.""" + c = ContextField("Raw/(unspecified)") + assert ( + c.value == "Raw/(unspecified)" + ), f"Expected c.value to be 'Raw/(unspecified)', but got {c.value!r}" + assert isinstance( + c.value, str + ), f"Expected c.value to be a str, but got {type(c.value)}" + + def test_init_with_list(self): + """Test initialization with list.""" + c = ContextField(["Raw", "(unspecified)"]) + assert c.value == [ + "Raw", + "(unspecified)", + ], f"Expected c.value to be ['Raw', '(unspecified)'], but got {c.value!r}" + assert isinstance( + c.value, list + ), f"Expected c.value to be a list, but got {type(c.value)}" + + def test_init_with_tuple(self): + """Test initialization with tuple.""" + c = ContextField(("Raw",)) + assert c.value == ( + "Raw", + ), f"Expected c.value to be ('Raw',), but got {c.value!r}" + assert isinstance( + c.value, tuple + ), f"Expected c.value to be a tuple, but got {type(c.value)}" + + def test_init_with_empty_string(self): + """Test initialization with empty string.""" + c = ContextField("") + assert c.value == "", f"Expected c.value to be '', but got {c.value!r}" + + def test_init_with_empty_list(self): + """Test initialization with empty list.""" + c = ContextField([]) + assert c.value == [], f"Expected c.value to be [], but got {c.value!r}" + + def test_init_with_empty_tuple(self): + """Test initialization with empty tuple.""" + c = ContextField(tuple([])) + assert c.value == (), f"Expected c.value to be (), but got {c.value!r}" + + +class TestContextFieldNormalize: + """Test ContextField normalize method.""" + + def test_normalize_with_string(self): + """Test normalize with string value.""" + c = ContextField("A/B") + normalized = c.normalize() + assert normalized.value == ( + "a", + "b", + ), f"Expected normalized.value to be ('a', 'b'), but got {normalized.value!r}" + assert 
isinstance( + normalized.value, tuple + ), f"Expected normalized.value to be a tuple, but got {type(normalized.value)}" + assert ( + c.value == "A/B" + ), f"Expected original c.value to remain 'A/B', but got {c.value!r}" + + def test_normalize_with_string_no_slash(self): + """Test normalize with string without slash.""" + c = ContextField("A-B") + normalized = c.normalize() + assert normalized.value == ( + "a-b", + ), f"Expected normalized.value to be ('a-b',), but got {normalized.value!r}" + + def test_normalize_with_list(self): + """Test normalize with list value.""" + c = ContextField(["Raw", "(unspecified)"]) + normalized = c.normalize() + assert normalized.value == ( + "raw", + ), f"Expected normalized.value to be ('raw',), but got {normalized.value!r}" + + def test_normalize_with_only_unspecified(self): + """Test normalize with only unspecified value.""" + # When the only value is unspecified, it should be kept + c = ContextField(["unspecified"]) + normalized = c.normalize() + assert normalized.value == ( + "unspecified", + ), f"Expected normalized.value to be ('unspecified',), but got {normalized.value!r}" + + # Test with (unspecified) in parentheses + c2 = ContextField(["(unspecified)"]) + normalized2 = c2.normalize() + assert normalized2.value == ( + "(unspecified)", + ), f"Expected normalized2.value to be ('(unspecified)',), but got {normalized2.value!r}" + + # Test with string "unspecified" + c3 = ContextField("unspecified") + normalized3 = c3.normalize() + assert normalized3.value == ( + "unspecified", + ), f"Expected normalized3.value to be ('unspecified',), but got {normalized3.value!r}" + + # Test with multiple unspecified values in parentheses + c4 = ContextField(["(unspecified)", "(unspecified)"]) + normalized4 = c4.normalize() + assert normalized4.value == ( + "(unspecified)", + ), f"Expected normalized4.value to be ('(unspecified)',), but got {normalized4.value!r}" + + def test_normalize_with_tuple(self): + """Test normalize with tuple value.""" + c = ContextField(("Raw",)) + normalized = c.normalize() + assert normalized.value == ( + "raw", + ), f"Expected normalized.value to be ('raw',), but got {normalized.value!r}" + + def test_normalize_with_obj_parameter(self): + """Test normalize with obj parameter.""" + c = ContextField("X/Y") + normalized = c.normalize("A/B") + assert normalized.value == ( + "a", + "b", + ), f"Expected normalized.value to be ('a', 'b'), but got {normalized.value!r}" + assert ( + c.value == "X/Y" + ), f"Expected original c.value to remain 'X/Y', but got {c.value!r}" + + def test_normalize_lowercase(self): + """Test normalize converts to lowercase.""" + c = ContextField("A-B") + normalized = c.normalize() + assert normalized.value == ( + "a-b", + ), f"Expected normalized.value to be ('a-b',), but got {normalized.value!r}" + + def test_normalize_strip(self): + """Test normalize strips whitespace.""" + c = ContextField(" A-B\t\n") + normalized = c.normalize() + assert normalized.value == ( + "a-b", + ), f"Expected normalized.value to be ('a-b',), but got {normalized.value!r}" + + def test_normalize_removes_trailing_missing_values(self): + """Test normalize removes trailing missing values.""" + c = ContextField(("A", "(unknown)")) + normalized = c.normalize() + assert normalized.value == ( + "a", + ), f"Expected normalized.value to be ('a',), but got {normalized.value!r}" + + @pytest.mark.parametrize("missing_value", MISSING_VALUES) + def test_normalize_removes_trailing_missing_value(self, missing_value): + """Test normalize removes each trailing missing value.""" 
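+ # MISSING_VALUES (imported above from flowmapper.utils) collects placeholder + # strings such as "(unknown)" and "(unspecified)" (see the preceding tests); + # each one should be dropped when it trails a real context segment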
+ c = ContextField(("A", missing_value)) + normalized = c.normalize() + assert normalized.value == ( + "a", + ), f"Expected normalized.value to be ('a',) for missing value {missing_value!r}, but got {normalized.value!r}" + + def test_normalize_removes_multiple_trailing_missing_values(self): + """Test normalize removes multiple trailing missing values.""" + c = ContextField(("A", "(unknown)", "(unspecified)")) + normalized = c.normalize() + assert normalized.value == ( + "a", + ), f"Expected normalized.value to be ('a',), but got {normalized.value!r}" + + def test_normalize_does_not_remove_leading_missing_values(self): + """Test normalize does not remove leading missing values.""" + c = ContextField(("(unknown)", "A")) + normalized = c.normalize() + assert normalized.value == ( + "(unknown)", + "a", + ), f"Expected normalized.value to be ('(unknown)', 'a'), but got {normalized.value!r}" + + def test_normalize_returns_new_instance(self): + """Test that normalize returns a new instance.""" + c = ContextField("A/B") + normalized = c.normalize() + assert ( + normalized is not c + ), "Expected normalize() to return a new instance, but it returned the same instance" + assert ( + c.value == "A/B" + ), f"Expected original c.value to remain 'A/B', but got {c.value!r}" + + def test_normalize_with_invalid_type_raises_error(self): + """Test normalize with invalid type raises ValueError.""" + + class Foo: + pass + + c = ContextField("A/B") + with pytest.raises(ValueError, match="Can't understand input context"): + c.normalize(Foo()) + + +class TestContextFieldExportAsString: + """Test ContextField export_as_string method.""" + + def test_export_as_string_with_list(self): + """Test export_as_string with list value.""" + c = ContextField(["A", "B"]) + result = c.export_as_string() + assert ( + result == "A✂️B" + ), f"Expected export_as_string() to be 'A✂️B', but got {result!r}" + + def test_export_as_string_with_tuple(self): + """Test export_as_string with tuple value.""" + c = ContextField(("A", "B")) + result = c.export_as_string() + assert ( + result == "A✂️B" + ), f"Expected export_as_string() to be 'A✂️B', but got {result!r}" + + def test_export_as_string_with_string(self): + """Test export_as_string with string value.""" + c = ContextField("A/B") + result = c.export_as_string() + assert ( + result == "A/B" + ), f"Expected export_as_string() to be 'A/B', but got {result!r}" + + def test_export_as_string_with_custom_join_character_list(self): + """Test export_as_string with custom join_character for list value.""" + c = ContextField(["A", "B"]) + result = c.export_as_string("/") + assert ( + result == "A/B" + ), f"Expected export_as_string('/') to be 'A/B', but got {result!r}" + + def test_export_as_string_with_custom_join_character_tuple(self): + """Test export_as_string with custom join_character for tuple value.""" + c = ContextField(("A", "B", "C")) + result = c.export_as_string("|") + assert ( + result == "A|B|C" + ), f"Expected export_as_string('|') to be 'A|B|C', but got {result!r}" + + def test_export_as_string_with_custom_join_character_dash(self): + """Test export_as_string with custom join_character '-'.""" + c = ContextField(["A", "B"]) + result = c.export_as_string("-") + assert ( + result == "A-B" + ), f"Expected export_as_string('-') to be 'A-B', but got {result!r}" + + def test_export_as_string_with_custom_join_character_string_value(self): + """Test export_as_string with custom join_character for string value (should not use join_character).""" + c = ContextField("A/B") + result = 
c.export_as_string("/") + # String values are returned as-is, join_character is not used + assert ( + result == "A/B" + ), f"Expected export_as_string('/') to be 'A/B' for string value, but got {result!r}" + + def test_export_as_string_with_custom_join_character_empty_string(self): + """Test export_as_string with custom join_character as empty string.""" + c = ContextField(["A", "B"]) + result = c.export_as_string("") + assert ( + result == "AB" + ), f"Expected export_as_string('') to be 'AB', but got {result!r}" + + def test_export_as_string_with_custom_join_character_space(self): + """Test export_as_string with custom join_character as space.""" + c = ContextField(["A", "B", "C"]) + result = c.export_as_string(" ") + assert ( + result == "A B C" + ), f"Expected export_as_string(' ') to be 'A B C', but got {result!r}" + + +class TestContextFieldEq: + """Test ContextField __eq__ method.""" + + def test_eq_with_same_contextfield(self): + """Test equality with same ContextField instance.""" + c1 = ContextField("A/B") + c2 = ContextField("A/B") + assert ( + c1 == c2 + ), f"Expected c1 to equal c2, but they are not equal (c1={c1!r}, c2={c2!r})" + + def test_eq_with_different_contextfield(self): + """Test equality with different ContextField.""" + c1 = ContextField("A/B") + c2 = ContextField("X/Y") + assert ( + c1 != c2 + ), f"Expected c1 to not equal c2, but they are equal (c1={c1!r}, c2={c2!r})" + + def test_eq_with_list_and_string(self): + """Test equality with list and string values.""" + c1 = ContextField("A/B") + c2 = ContextField(["A", "B"]) + # Different value types, so not equal + assert ( + c1 != c2 + ), f"Expected c1 to not equal c2, but they are equal (c1={c1!r}, c2={c2!r})" + + def test_eq_with_string_other(self): + """Test equality with string other.""" + c = ContextField("A/B") + # __eq__ normalizes the other value and compares + # "A/B" normalized is ('a', 'b'), but c.value is "A/B", so not equal + assert ( + c != "A/B" + ), f"Expected c to not equal 'A/B', but they are equal (c={c!r})" + + def test_eq_with_empty_contextfield(self): + """Test equality with empty ContextField.""" + c1 = ContextField("") + c2 = ContextField("") + # Empty strings are falsy, so __eq__ goes to else branch + # Empty string normalizes to ('',), so c1.value ("") != normalized c2.value (('',)) + assert ( + c1 != c2 + ), f"Expected c1 to not equal c2 for empty strings, but they are equal (c1={c1!r}, c2={c2!r})" + + def test_eq_with_other_type(self): + """Test equality with non-ContextField type.""" + c = ContextField("A/B") + assert c != 123, f"Expected c to not equal 123, but they are equal (c={c!r})" + assert c != None, f"Expected c to not equal None, but they are equal (c={c!r})" + assert c != [], f"Expected c to not equal [], but they are equal (c={c!r})" + + +class TestContextFieldBool: + """Test ContextField __bool__ method.""" + + def test_bool_with_non_empty_string(self): + """Test __bool__ with non-empty string.""" + c = ContextField("A/B") + assert bool(c) is True, f"Expected bool(c) to be True, but got {bool(c)}" + + def test_bool_with_empty_string(self): + """Test __bool__ with empty string.""" + c = ContextField("") + assert bool(c) is False, f"Expected bool(c) to be False, but got {bool(c)}" + + def test_bool_with_non_empty_list(self): + """Test __bool__ with non-empty list.""" + c = ContextField(["A", "B"]) + assert bool(c) is True, f"Expected bool(c) to be True, but got {bool(c)}" + + def test_bool_with_empty_list(self): + """Test __bool__ with empty list.""" + c = ContextField([]) + 
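# Mirrors the wrapped value's truthiness: an empty list is falsy. + 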
assert bool(c) is False, f"Expected bool(c) to be False, but got {bool(c)}" + + def test_bool_with_non_empty_tuple(self): + """Test __bool__ with non-empty tuple.""" + c = ContextField(("A",)) + assert bool(c) is True, f"Expected bool(c) to be True, but got {bool(c)}" + + def test_bool_with_empty_tuple(self): + """Test __bool__ with empty tuple.""" + c = ContextField(()) + assert bool(c) is False, f"Expected bool(c) to be False, but got {bool(c)}" + + +class TestContextFieldHash: + """Test ContextField __hash__ method.""" + + def test_hash_with_string(self): + """Test __hash__ with string value.""" + c = ContextField("A/B") + result = hash(c) + assert isinstance( + result, int + ), f"Expected hash(c) to be an int, but got {type(result)}" + + def test_hash_with_list_raises_error(self): + """Test __hash__ with list value raises TypeError.""" + c = ContextField(["A", "B"]) + # Lists are not hashable, so hash() raises TypeError + with pytest.raises(TypeError): + _ = hash(c) + + def test_hash_with_tuple(self): + """Test __hash__ with tuple value.""" + c = ContextField(("A", "B")) + result = hash(c) + assert isinstance( + result, int + ), f"Expected hash(c) to be an int, but got {type(result)}" + + def test_hash_same_values(self): + """Test __hash__ with same values.""" + c1 = ContextField("A/B") + c2 = ContextField("A/B") + assert hash(c1) == hash( + c2 + ), f"Expected hash(c1) to equal hash(c2), but got {hash(c1)} and {hash(c2)}" + + +class TestContextFieldIter: + """Test ContextField __iter__ method.""" + + def test_iter_with_string(self): + """Test __iter__ with string value.""" + c = ContextField("A/B") + result = list(c) + assert result == [ + "A", + "/", + "B", + ], f"Expected list(c) to be ['A', '/', 'B'], but got {result!r}" + + def test_iter_with_list(self): + """Test __iter__ with list value.""" + c = ContextField(["A", "B"]) + result = list(c) + assert result == [ + "A", + "B", + ], f"Expected list(c) to be ['A', 'B'], but got {result!r}" + + def test_iter_with_tuple(self): + """Test __iter__ with tuple value.""" + c = ContextField(("A", "B")) + result = list(c) + assert result == [ + "A", + "B", + ], f"Expected list(c) to be ['A', 'B'], but got {result!r}" + + +class TestContextFieldContains: + """Test ContextField __contains__ method.""" + + def test_contains_with_string_values(self): + """Test __contains__ with string values.""" + c1 = ContextField("A") + c2 = ContextField("A/B") + # c2 in c1 means c1 is more generic than c2 + # This checks if c1.value == c2.value[:len(c1.value)] + # "A" == "A/B"[:1] -> "A" == "A" -> True + assert ( + c2 in c1 + ), f"Expected c2 to be in c1, but it was not (c1={c1!r}, c2={c2!r})" + assert ( + c1 not in c2 + ), f"Expected c1 to not be in c2, but it was (c1={c1!r}, c2={c2!r})" + + def test_contains_with_tuple_values(self): + """Test __contains__ with tuple values.""" + c1 = ContextField(("A",)) + c2 = ContextField(("A", "B")) + # c2 in c1 means c1 is more generic than c2 + # This checks if c1.value == c2.value[:len(c1.value)] + # ("A",) == ("A", "B")[:1] -> ("A",) == ("A",) -> True + assert ( + c2 in c1 + ), f"Expected c2 to be in c1, but it was not (c1={c1!r}, c2={c2!r})" + assert ( + c1 not in c2 + ), f"Expected c1 to not be in c2, but it was (c1={c1!r}, c2={c2!r})" + + def test_contains_with_list_values(self): + """Test __contains__ with list values.""" + c1 = ContextField(["A"]) + c2 = ContextField(["A", "B"]) + # c2 in c1 means c1 is more generic than c2 + assert ( + c2 in c1 + ), f"Expected c2 to be in c1, but it was not (c1={c1!r}, 
c2={c2!r})" + assert ( + c1 not in c2 + ), f"Expected c1 to not be in c2, but it was (c1={c1!r}, c2={c2!r})" + + def test_contains_with_non_contextfield(self): + """Test __contains__ with non-ContextField returns False.""" + c = ContextField("A/B") + assert "A/B" not in c, f"Expected 'A/B' to not be in c, but it was (c={c!r})" + assert 123 not in c, f"Expected 123 to not be in c, but it was (c={c!r})" + + +class TestContextFieldRepr: + """Test ContextField __repr__ method.""" + + def test_repr_with_string(self): + """Test __repr__ with string value.""" + c = ContextField("A/B") + result = repr(c) + assert result == "A/B", f"Expected repr(c) to be 'A/B', but got {result!r}" + + def test_repr_with_list(self): + """Test __repr__ with list value.""" + c = ContextField(["A", "B"]) + result = repr(c) + assert ( + result == "['A', 'B']" + ), f"Expected repr(c) to be '['A', 'B']', but got {result!r}" + + def test_repr_with_tuple(self): + """Test __repr__ with tuple value.""" + c = ContextField(("A", "B")) + result = repr(c) + assert ( + result == "('A', 'B')" + ), f"Expected repr(c) to be '('A', 'B')', but got {result!r}" + + +class TestContextFieldEdgeCases: + """Test ContextField edge cases.""" + + def test_normalize_preserves_original_value(self): + """Test that normalize preserves original value.""" + c = ContextField("ORIGINAL") + normalized = c.normalize() + assert ( + c.value == "ORIGINAL" + ), f"Expected original c.value to remain 'ORIGINAL', but got {c.value!r}" + assert normalized.value == ( + "original", + ), f"Expected normalized.value to be ('original',), but got {normalized.value!r}" + + def test_multiple_normalize_calls(self): + """Test multiple normalize calls.""" + c = ContextField(" TEST ") + norm1 = c.normalize() + norm2 = norm1.normalize() + assert norm1.value == ( + "test", + ), f"Expected norm1.value to be ('test',), but got {norm1.value!r}" + assert norm2.value == ( + "test", + ), f"Expected norm2.value to be ('test',), but got {norm2.value!r}" + + def test_normalize_with_mapping_parameter(self): + """Test normalize with mapping parameter (currently not implemented).""" + c = ContextField("A/B") + # mapping parameter is accepted but not used (TODO in code) + normalized = c.normalize(mapping={"A": "X"}) + assert normalized.value == ( + "a", + "b", + ), f"Expected normalized.value to be ('a', 'b'), but got {normalized.value!r}" + + +class TestContextFieldIsResource: + """Test ContextField is_resource method.""" + + @pytest.mark.parametrize( + "value,expected", + [ + # String values that should return True (resource categories) + ("resource", True), + ("resources", True), + ("natural resource", True), + ("natural resources", True), + ("land use", True), + ("economic", True), + ("social", True), + ("raw materials", True), + ("raw", True), + # Case insensitivity + ("RESOURCE", True), + ("Natural Resource", True), + # Substring matches + ("water resource extraction", True), + ("natural resource extraction", True), + ("economic activity", True), + ("social aspect", True), + # Slash-separated strings with resource + ("resource/air", True), + # String values that should return False + ("emission", False), + ("air", False), + ("water", False), + ("", False), + ("emission/air", False), + ], + ) + def test_is_resource_with_string(self, value, expected): + """Test is_resource with string values.""" + c = ContextField(value) + assert ( + c.is_resource() is expected + ), f"Expected is_resource() to be {expected} for {value!r}, but got {c.is_resource()}" + + @pytest.mark.parametrize( + 
"value,expected", + [ + # List values that should return True + (["resource"], True), + (["resources"], True), + (["raw"], True), + (["land use"], True), + (["economic"], True), + (["social"], True), + (["raw materials"], True), + (["RESOURCE"], True), # Case insensitive + (["emission", "resource", "air"], True), # Multiple elements, one resource + # List values that should return False + (["emission", "air", "water"], False), + ([], False), + ], + ) + def test_is_resource_with_list(self, value, expected): + """Test is_resource with list values.""" + c = ContextField(value) + assert ( + c.is_resource() is expected + ), f"Expected is_resource() to be {expected} for {value!r}, but got {c.is_resource()}" + + @pytest.mark.parametrize( + "value,expected", + [ + # Tuple values that should return True + (("resource",), True), + (("raw",), True), + (("emission", "resource", "air"), True), # Multiple elements, one resource + # Tuple values that should return False + (("emission", "air"), False), + ((), False), + ], + ) + def test_is_resource_with_tuple(self, value, expected): + """Test is_resource with tuple values.""" + c = ContextField(value) + assert ( + c.is_resource() is expected + ), f"Expected is_resource() to be {expected} for {value!r}, but got {c.is_resource()}" diff --git a/tests/unit/test_flowmap.py b/tests/unit/test_flowmap.py new file mode 100644 index 0000000..483d8be --- /dev/null +++ b/tests/unit/test_flowmap.py @@ -0,0 +1,1274 @@ +"""Unit tests for Flowmap class using mocks.""" + +from copy import copy +from pathlib import Path +from unittest.mock import MagicMock, Mock, patch + +import pandas as pd +import pytest + +from flowmapper.domain.flow import Flow +from flowmapper.domain.match import Match +from flowmapper.domain.match_condition import MatchCondition +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.flowmap import Flowmap + + +class TestFlowmapInit: + """Test Flowmap __init__ method.""" + + @patch("flowmapper.flowmap.match_rules") + def test_init_with_default_rules(self, mock_match_rules): + """Test initialization with default rules.""" + mock_rules = [Mock(), Mock()] + mock_match_rules.return_value = mock_rules + + source_flows = [Mock(spec=NormalizedFlow)] + target_flows = [Mock(spec=NormalizedFlow)] + data_prep_funcs = [Mock()] + + flowmap = Flowmap( + source_flows=source_flows, + target_flows=target_flows, + data_preparation_functions=data_prep_funcs, + ) + + assert flowmap.source_flows == source_flows + assert flowmap.target_flows == target_flows + assert flowmap.data_preparation_functions == data_prep_funcs + assert flowmap.rules == mock_rules + assert flowmap.matches == [] + assert flowmap.show_progressbar is True + mock_match_rules.assert_called_once() + + def test_init_with_custom_rules(self): + """Test initialization with custom rules.""" + source_flows = [Mock(spec=NormalizedFlow)] + target_flows = [Mock(spec=NormalizedFlow)] + data_prep_funcs = [Mock()] + custom_rules = [Mock(), Mock()] + + flowmap = Flowmap( + source_flows=source_flows, + target_flows=target_flows, + data_preparation_functions=data_prep_funcs, + rules=custom_rules, + ) + + assert flowmap.rules == custom_rules + + def test_init_with_show_progressbar_false(self): + """Test initialization with show_progressbar=False.""" + source_flows = [Mock(spec=NormalizedFlow)] + target_flows = [Mock(spec=NormalizedFlow)] + data_prep_funcs = [Mock()] + + flowmap = Flowmap( + source_flows=source_flows, + target_flows=target_flows, + data_preparation_functions=data_prep_funcs, + 
show_progressbar=False, + ) + + assert flowmap.show_progressbar is False + + +class TestFlowmapGenerateMatches: + """Test Flowmap generate_matches method.""" + + @patch("flowmapper.flowmap.logger") + @patch("flowmapper.flowmap.time") + def test_generate_matches_applies_rules(self, mock_time, mock_logger): + """Test that generate_matches applies all rules.""" + # time() is called once per rule for start time, then again for elapsed + # Provide enough values: start1, end1, start2, end2 + mock_time.side_effect = [0.0, 1.0, 1.0, 2.0] + + # Create mock flows + source_flow1 = Mock(spec=NormalizedFlow) + source_flow1.matched = False + source_flow2 = Mock(spec=NormalizedFlow) + source_flow2.matched = False + target_flow = Mock(spec=NormalizedFlow) + + # Create mock matches + match1 = Mock(spec=Match) + match1.new_target_flow = False + match2 = Mock(spec=Match) + match2.new_target_flow = False + + # Create mock rules + rule1 = Mock() + rule1.__name__ = "rule1" + rule1.return_value = [match1] + + rule2 = Mock() + rule2.__name__ = "rule2" + rule2.return_value = [match2] + + flowmap = Flowmap( + source_flows=[source_flow1, source_flow2], + target_flows=[target_flow], + data_preparation_functions=[], + rules=[rule1, rule2], + ) + + flowmap.generate_matches() + + # Verify rules were called with unmatched flows + assert rule1.called + assert rule2.called + assert len(flowmap.matches) == 2 + assert flowmap.matches == [match1, match2] + + @patch("flowmapper.flowmap.logger") + @patch("flowmapper.flowmap.time") + def test_generate_matches_filters_matched_flows(self, mock_time, mock_logger): + """Test that generate_matches only passes unmatched flows to rules.""" + mock_time.side_effect = [0.0, 1.0] + + source_flow1 = Mock(spec=NormalizedFlow) + source_flow1.matched = False + source_flow2 = Mock(spec=NormalizedFlow) + source_flow2.matched = True # Already matched + target_flow = Mock(spec=NormalizedFlow) + + rule = Mock() + rule.__name__ = "test_rule" + rule.return_value = [] + + flowmap = Flowmap( + source_flows=[source_flow1, source_flow2], + target_flows=[target_flow], + data_preparation_functions=[], + rules=[rule], + ) + + flowmap.generate_matches() + + # Verify rule was called with only unmatched flow + rule.assert_called_once() + call_args = rule.call_args + assert len(call_args.kwargs["source_flows"]) == 1 + assert call_args.kwargs["source_flows"][0] == source_flow1 + + @patch("flowmapper.flowmap.logger") + @patch("flowmapper.flowmap.time") + def test_generate_matches_adds_new_target_flows(self, mock_time, mock_logger): + """Test that generate_matches adds new target flows when created.""" + mock_time.side_effect = [0.0, 1.0] + + source_flow = Mock(spec=NormalizedFlow) + source_flow.matched = False + target_flow = Mock(spec=NormalizedFlow) + new_target_flow = Mock(spec=Flow) + + match = Mock(spec=Match) + match.new_target_flow = True + match.target = new_target_flow + + rule = Mock() + rule.__name__ = "test_rule" + rule.return_value = [match] + + flowmap = Flowmap( + source_flows=[source_flow], + target_flows=[target_flow], + data_preparation_functions=[], + rules=[rule], + ) + + # Mock the add_new_target_flows method + flowmap.add_new_target_flows = Mock() + + flowmap.generate_matches() + + # Verify add_new_target_flows was called with new target flow + flowmap.add_new_target_flows.assert_called_once_with([new_target_flow]) + + @patch("flowmapper.flowmap.logger") + @patch("flowmapper.flowmap.time") + def test_generate_matches_logs_with_new_target_flows(self, mock_time, mock_logger): + """Test that 
generate_matches logs correctly when new target flows are created.""" + mock_time.side_effect = [0.0, 1.0] + + source_flow = Mock(spec=NormalizedFlow) + source_flow.matched = False + target_flow = Mock(spec=NormalizedFlow) + new_target_flow = Mock(spec=Flow) + + match = Mock(spec=Match) + match.new_target_flow = True + match.target = new_target_flow + + rule = Mock() + rule.__name__ = "test_rule" + rule.return_value = [match] + + flowmap = Flowmap( + source_flows=[source_flow], + target_flows=[target_flow], + data_preparation_functions=[], + rules=[rule], + ) + flowmap.add_new_target_flows = Mock() + + flowmap.generate_matches() + + # Verify logger was called with message about new target flows + mock_logger.info.assert_called() + call_args = mock_logger.info.call_args[0][0] + assert "new target flows" in call_args.lower() + assert "1" in call_args # 1 new target flow + + @patch("flowmapper.flowmap.logger") + @patch("flowmapper.flowmap.time") + def test_generate_matches_logs_without_new_target_flows( + self, mock_time, mock_logger + ): + """Test that generate_matches logs correctly when no new target flows.""" + mock_time.side_effect = [0.0, 1.0] + + source_flow = Mock(spec=NormalizedFlow) + source_flow.matched = False + target_flow = Mock(spec=NormalizedFlow) + + match = Mock(spec=Match) + match.new_target_flow = False + + rule = Mock() + rule.__name__ = "test_rule" + rule.return_value = [match] + + flowmap = Flowmap( + source_flows=[source_flow], + target_flows=[target_flow], + data_preparation_functions=[], + rules=[rule], + ) + + flowmap.generate_matches() + + # Verify logger was called without mention of new target flows + mock_logger.info.assert_called() + call_args = mock_logger.info.call_args[0][0] + assert "new target flows" not in call_args.lower() + + +class TestFlowmapAddNewTargetFlows: + """Test Flowmap add_new_target_flows method.""" + + @patch( + "flowmapper.flowmap.apply_transformation_and_convert_flows_to_normalized_flows" + ) + def test_add_new_target_flows_normalizes_and_adds(self, mock_apply): + """Test that add_new_target_flows normalizes flows and adds them.""" + new_flow1 = Mock(spec=Flow) + new_flow2 = Mock(spec=Flow) + + normalized_flow1 = Mock(spec=NormalizedFlow) + normalized_flow2 = Mock(spec=NormalizedFlow) + mock_apply.return_value = [normalized_flow1, normalized_flow2] + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[Mock()], + ) + + flowmap.add_new_target_flows([new_flow1, new_flow2]) + + # Verify flows were normalized + mock_apply.assert_called_once_with( + functions=flowmap.data_preparation_functions, flows=[new_flow1, new_flow2] + ) + + # Verify normalized flows were added + assert len(flowmap.target_flows) == 2 + assert flowmap.target_flows == [normalized_flow1, normalized_flow2] + + +class TestFlowmapMatchedSource: + """Test Flowmap matched_source method.""" + + def test_matched_source_returns_matched_flows(self): + """Test that matched_source returns only matched flows.""" + # Create flows with IDs + source_flow1 = Mock(spec=NormalizedFlow) + source_flow1.id = 1 + source_flow2 = Mock(spec=NormalizedFlow) + source_flow2.id = 2 + source_flow3 = Mock(spec=NormalizedFlow) + source_flow3.id = 3 + + # Create matches with source flows + source_flow_for_match1 = Mock(spec=Flow) + source_flow_for_match1._id = 1 + source_flow_for_match2 = Mock(spec=Flow) + source_flow_for_match2._id = 2 + + match1 = Mock(spec=Match) + match1.source = source_flow_for_match1 + match2 = Mock(spec=Match) + match2.source = 
source_flow_for_match2 + + flowmap = Flowmap( + source_flows=[source_flow1, source_flow2, source_flow3], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1, match2] + + result = flowmap.matched_source() + + assert len(result) == 2 + assert source_flow1 in result + assert source_flow2 in result + assert source_flow3 not in result + + def test_matched_source_returns_empty_when_no_matches(self): + """Test that matched_source returns empty list when no matches.""" + source_flow = Mock(spec=NormalizedFlow) + source_flow.id = 1 + + flowmap = Flowmap( + source_flows=[source_flow], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [] + + result = flowmap.matched_source() + + assert result == [] + + +class TestFlowmapUnmatchedSource: + """Test Flowmap unmatched_source property.""" + + def test_unmatched_source_returns_unmatched_flows(self): + """Test that unmatched_source returns only unmatched flows.""" + # Create flows with IDs + source_flow1 = Mock(spec=NormalizedFlow) + source_flow1.id = 1 + source_flow2 = Mock(spec=NormalizedFlow) + source_flow2.id = 2 + source_flow3 = Mock(spec=NormalizedFlow) + source_flow3.id = 3 + + # Create matches for flow1 and flow2 + source_flow_for_match1 = Mock(spec=Flow) + source_flow_for_match1._id = 1 + source_flow_for_match2 = Mock(spec=Flow) + source_flow_for_match2._id = 2 + + match1 = Mock(spec=Match) + match1.source = source_flow_for_match1 + match2 = Mock(spec=Match) + match2.source = source_flow_for_match2 + + flowmap = Flowmap( + source_flows=[source_flow1, source_flow2, source_flow3], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1, match2] + + result = flowmap.unmatched_source + + assert len(result) == 1 + assert source_flow3 in result + assert source_flow1 not in result + assert source_flow2 not in result + + def test_unmatched_source_returns_all_when_no_matches(self): + """Test that unmatched_source returns all flows when no matches.""" + source_flow1 = Mock(spec=NormalizedFlow) + source_flow1.id = 1 + source_flow2 = Mock(spec=NormalizedFlow) + source_flow2.id = 2 + + flowmap = Flowmap( + source_flows=[source_flow1, source_flow2], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [] + + result = flowmap.unmatched_source + + assert len(result) == 2 + assert source_flow1 in result + assert source_flow2 in result + + +class TestFlowmapMatchedSourceStatistics: + """Test Flowmap matched_source_statistics method.""" + + def test_matched_source_statistics_creates_dataframe(self): + """Test that matched_source_statistics returns a DataFrame.""" + # Create flows with contexts + context1 = Mock() + context1.value = "air" + context2 = Mock() + context2.value = "water" + + source_flow1 = Mock(spec=NormalizedFlow) + source_flow1.original = Mock() + source_flow1.original.context = context1 + source_flow2 = Mock(spec=NormalizedFlow) + source_flow2.original = Mock() + source_flow2.original.context = context2 + + # Create matches + match_context1 = Mock() + match_context1.value = "air" + match_context2 = Mock() + match_context2.value = "air" + + match1 = Mock(spec=Match) + match1.source = Mock() + match1.source.context = match_context1 + match2 = Mock(spec=Match) + match2.source = Mock() + match2.source.context = match_context2 + + flowmap = Flowmap( + source_flows=[source_flow1, source_flow2], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1, match2] + + result = flowmap.matched_source_statistics() + + assert 
isinstance(result, pd.DataFrame) + assert "context" in result.columns + assert "matched" in result.columns + assert "total" in result.columns + assert "percent" in result.columns + + def test_matched_source_statistics_calculates_percentages(self): + """Test that matched_source_statistics calculates correct percentages.""" + # Create flows with contexts + air_context = Mock() + air_context.value = "air" + water_context = Mock() + water_context.value = "water" + + source_flow1 = Mock(spec=NormalizedFlow) + source_flow1.original = Mock() + source_flow1.original.context = air_context + source_flow2 = Mock(spec=NormalizedFlow) + source_flow2.original = Mock() + source_flow2.original.context = air_context + source_flow3 = Mock(spec=NormalizedFlow) + source_flow3.original = Mock() + source_flow3.original.context = water_context + + # Create match for one air flow + match_air_context = Mock() + match_air_context.value = "air" + match1 = Mock(spec=Match) + match1.source = Mock() + match1.source.context = match_air_context + + flowmap = Flowmap( + source_flows=[source_flow1, source_flow2, source_flow3], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1] + + result = flowmap.matched_source_statistics() + + # Check air context: 1 matched, 2 total + air_row = result[result["context"] == "air"].iloc[0] + assert air_row["matched"] == 1 + assert air_row["total"] == 2 + assert air_row["percent"] == 0.5 + + # Check water context: 0 matched, 1 total + water_row = result[result["context"] == "water"].iloc[0] + assert water_row["matched"] == 0 + assert water_row["total"] == 1 + assert water_row["percent"] == 0.0 + + def test_matched_source_statistics_sorts_by_percent(self): + """Test that matched_source_statistics sorts by percentage.""" + # Create flows with different contexts + air_context = Mock() + air_context.value = "air" + water_context = Mock() + water_context.value = "water" + + source_flow1 = Mock(spec=NormalizedFlow) + source_flow1.original = Mock() + source_flow1.original.context = air_context + source_flow2 = Mock(spec=NormalizedFlow) + source_flow2.original = Mock() + source_flow2.original.context = water_context + + # Create match for air + match_air_context = Mock() + match_air_context.value = "air" + match1 = Mock(spec=Match) + match1.source = Mock() + match1.source.context = match_air_context + + flowmap = Flowmap( + source_flows=[source_flow1, source_flow2], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1] + + result = flowmap.matched_source_statistics() + + # Should be sorted by percent ascending + assert result.iloc[0]["percent"] <= result.iloc[1]["percent"] + + +class TestFlowmapMatchedTargetStatistics: + """Test Flowmap matched_target_statistics property.""" + + def test_matched_target_statistics_creates_dataframe(self): + """Test that matched_target_statistics returns a DataFrame.""" + # Create flows with contexts + air_context = Mock() + air_context.value = "air" + water_context = Mock() + water_context.value = "water" + + target_flow1 = Mock(spec=NormalizedFlow) + target_flow1.original = Mock() + target_flow1.original.context = air_context + target_flow2 = Mock(spec=NormalizedFlow) + target_flow2.original = Mock() + target_flow2.original.context = water_context + + # Create matches + match_air_context = Mock() + match_air_context.value = "air" + match1 = Mock(spec=Match) + match1.target = Mock() + match1.target.context = match_air_context + + flowmap = Flowmap( + source_flows=[], + target_flows=[target_flow1, 
target_flow2], + data_preparation_functions=[], + ) + flowmap.matches = [match1] + + result = flowmap.matched_target_statistics + + assert isinstance(result, pd.DataFrame) + assert "context" in result.columns + assert "matched" in result.columns + assert "total" in result.columns + assert "percent" in result.columns + + def test_matched_target_statistics_calculates_percentages(self): + """Test that matched_target_statistics calculates correct percentages.""" + # Create flows with contexts + air_context = Mock() + air_context.value = "air" + water_context = Mock() + water_context.value = "water" + + target_flow1 = Mock(spec=NormalizedFlow) + target_flow1.original = Mock() + target_flow1.original.context = air_context + target_flow2 = Mock(spec=NormalizedFlow) + target_flow2.original = Mock() + target_flow2.original.context = air_context + target_flow3 = Mock(spec=NormalizedFlow) + target_flow3.original = Mock() + target_flow3.original.context = water_context + + # Create match + match_air_context = Mock() + match_air_context.value = "air" + match1 = Mock(spec=Match) + match1.target = Mock() + match1.target.context = match_air_context + + flowmap = Flowmap( + source_flows=[], + target_flows=[target_flow1, target_flow2, target_flow3], + data_preparation_functions=[], + ) + flowmap.matches = [match1] + + result = flowmap.matched_target_statistics + + # Check air context: 1 matched, 2 total + air_row = result[result["context"] == "air"].iloc[0] + assert air_row["matched"] == 1 + assert air_row["total"] == 2 + assert air_row["percent"] == 0.5 + + +class TestFlowmapPrintStatistics: + """Test Flowmap print_statistics method.""" + + @patch("builtins.print") + def test_print_statistics_outputs_summary(self, mock_print): + """Test that print_statistics outputs correct summary.""" + source_flow = Mock(spec=NormalizedFlow) + target_flow = Mock(spec=NormalizedFlow) + + source_flow_for_match = Mock(spec=Flow) + source_flow_for_match._id = 1 + target_flow_for_match = Mock(spec=Flow) + target_flow_for_match._id = 2 + + match = Mock(spec=Match) + match.source = source_flow_for_match + match.target = target_flow_for_match + + flowmap = Flowmap( + source_flows=[source_flow], + target_flows=[target_flow], + data_preparation_functions=[], + ) + flowmap.matches = [match] + + flowmap.print_statistics() + + # Verify print was called + mock_print.assert_called_once() + output = mock_print.call_args[0][0] + + assert "1 source" in output + assert "1 target" in output + assert "1 mappings" in output + assert "cardinalities" in output.lower() + + @patch("builtins.print") + def test_print_statistics_handles_zero_division(self, mock_print): + """Test that print_statistics handles zero source flows.""" + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [] + + # Should not raise ZeroDivisionError + flowmap.print_statistics() + mock_print.assert_called_once() + + +class TestFlowmapCardinalities: + """Test Flowmap cardinalities method.""" + + def test_cardinalities_1_to_1(self): + """Test cardinalities for 1:1 relationships.""" + source_flow = Mock(spec=Flow) + source_flow._id = 1 + target_flow = Mock(spec=Flow) + target_flow._id = 2 + + match = Mock(spec=Match) + match.source = source_flow + match.target = target_flow + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match] + + result = flowmap.cardinalities() + + assert len(result) == 1 + assert result[0]["from"] == 1 + assert 
result[0]["to"] == 2 + assert result[0]["cardinality"] == "1:1" + + def test_cardinalities_1_to_n(self): + """Test cardinalities for 1:N relationships.""" + source_flow = Mock(spec=Flow) + source_flow._id = 1 + target_flow1 = Mock(spec=Flow) + target_flow1._id = 2 + target_flow2 = Mock(spec=Flow) + target_flow2._id = 3 + + match1 = Mock(spec=Match) + match1.source = source_flow + match1.target = target_flow1 + match2 = Mock(spec=Match) + match2.source = source_flow + match2.target = target_flow2 + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1, match2] + + result = flowmap.cardinalities() + + assert len(result) == 2 + assert all(r["cardinality"] == "1:N" for r in result) + + def test_cardinalities_n_to_1(self): + """Test cardinalities for N:1 relationships.""" + source_flow1 = Mock(spec=Flow) + source_flow1._id = 1 + source_flow2 = Mock(spec=Flow) + source_flow2._id = 2 + target_flow = Mock(spec=Flow) + target_flow._id = 3 + + match1 = Mock(spec=Match) + match1.source = source_flow1 + match1.target = target_flow + match2 = Mock(spec=Match) + match2.source = source_flow2 + match2.target = target_flow + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1, match2] + + result = flowmap.cardinalities() + + assert len(result) == 2 + assert all(r["cardinality"] == "N:1" for r in result) + + def test_cardinalities_n_to_m(self): + """Test cardinalities for N:M relationships.""" + source_flow1 = Mock(spec=Flow) + source_flow1._id = 1 + source_flow2 = Mock(spec=Flow) + source_flow2._id = 2 + target_flow1 = Mock(spec=Flow) + target_flow1._id = 3 + target_flow2 = Mock(spec=Flow) + target_flow2._id = 4 + + match1 = Mock(spec=Match) + match1.source = source_flow1 + match1.target = target_flow1 + match2 = Mock(spec=Match) + match2.source = source_flow1 + match2.target = target_flow2 + match3 = Mock(spec=Match) + match3.source = source_flow2 + match3.target = target_flow1 + match4 = Mock(spec=Match) + match4.source = source_flow2 + match4.target = target_flow2 + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1, match2, match3, match4] + + result = flowmap.cardinalities() + + assert len(result) == 4 + assert all(r["cardinality"] == "N:M" for r in result) + + def test_cardinalities_sorted_by_from(self): + """Test that cardinalities are sorted by source ID.""" + matches = [] + for i in range(5, 0, -1): # Reverse order + source_flow = Mock(spec=Flow) + source_flow._id = i + target_flow = Mock(spec=Flow) + target_flow._id = i + 10 + match = Mock(spec=Match) + match.source = source_flow + match.target = target_flow + matches.append(match) + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = matches + + result = flowmap.cardinalities() + + # Verify sorted by 'from' (source ID) + from_ids = [r["from"] for r in result] + assert from_ids == sorted(from_ids) + + +class TestFlowmapToRandonneur: + """Test Flowmap to_randonneur method.""" + + @patch("flowmapper.flowmap.randonneur.Datapackage") + def test_to_randonneur_creates_datapackage(self, mock_datapackage_class): + """Test that to_randonneur creates a Datapackage.""" + mock_dp = Mock() + mock_datapackage_class.return_value = mock_dp + + match = Mock(spec=Match) + match.export.return_value = {"source": "test"} + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + 
data_preparation_functions=[], + ) + flowmap.matches = [match] + + result = flowmap.to_randonneur( + source_id="source", + target_id="target", + contributors=[], + mapping_source={}, + mapping_target={}, + ) + + # Verify Datapackage was created + mock_datapackage_class.assert_called_once() + assert result == mock_dp + + @patch("flowmapper.flowmap.randonneur.Datapackage") + def test_to_randonneur_adds_match_data(self, mock_datapackage_class): + """Test that to_randonneur adds match data to datapackage.""" + mock_dp = Mock() + mock_datapackage_class.return_value = mock_dp + + match1 = Mock(spec=Match) + match1.export.return_value = {"source": "test1"} + match2 = Mock(spec=Match) + match2.export.return_value = {"source": "test2"} + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match1, match2] + + flowmap.to_randonneur( + source_id="source", + target_id="target", + contributors=[], + mapping_source={}, + mapping_target={}, + ) + + # Verify add_data was called with exported matches + mock_dp.add_data.assert_called_once() + call_args = mock_dp.add_data.call_args + assert call_args.kwargs["verb"] == "update" + assert len(call_args.kwargs["data"]) == 2 + + @patch("flowmapper.flowmap.randonneur.Datapackage") + def test_to_randonneur_saves_to_path(self, mock_datapackage_class): + """Test that to_randonneur saves to path if provided.""" + mock_dp = Mock() + mock_datapackage_class.return_value = mock_dp + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [] + + test_path = Path("/tmp/test.json") + + flowmap.to_randonneur( + source_id="source", + target_id="target", + contributors=[], + mapping_source={}, + mapping_target={}, + path=test_path, + ) + + # Verify to_json was called + mock_dp.to_json.assert_called_once_with(test_path) + + @patch("flowmapper.flowmap.randonneur.Datapackage") + def test_to_randonneur_uses_custom_name(self, mock_datapackage_class): + """Test that to_randonneur uses custom name if provided.""" + mock_dp = Mock() + mock_datapackage_class.return_value = mock_dp + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [] + + flowmap.to_randonneur( + source_id="source", + target_id="target", + contributors=[], + mapping_source={}, + mapping_target={}, + name="custom-name", + ) + + # Verify name was used + call_args = mock_datapackage_class.call_args + assert call_args.kwargs["name"] == "custom-name" + + @patch("flowmapper.flowmap.randonneur.Datapackage") + def test_to_randonneur_defaults_name(self, mock_datapackage_class): + """Test that to_randonneur defaults name to source-target.""" + mock_dp = Mock() + mock_datapackage_class.return_value = mock_dp + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [] + + flowmap.to_randonneur( + source_id="source-v1", + target_id="target-v2", + contributors=[], + mapping_source={}, + mapping_target={}, + ) + + # Verify default name was used + call_args = mock_datapackage_class.call_args + assert call_args.kwargs["name"] == "source-v1-target-v2" + + +class TestFlowmapToGlad: + """Test Flowmap to_glad method.""" + + def test_to_glad_creates_dataframe(self): + """Test that to_glad returns a DataFrame.""" + # Create match with all required attributes + source_name = Mock() + source_name.__str__ = Mock(return_value="Source Flow") + source_context = Mock() + 
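+        # Assumed shape of the export, inferred from the column assertions in
+        # test_to_glad_includes_all_columns below: each Match flattens to one
+        # GLAD row with source/target name, UUID, context, and unit, plus
+        # MatchCondition, ConversionFactor, and MemoMapper.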
source_context.export_as_string.return_value = "air" + source_unit = Mock() + source_unit.__str__ = Mock(return_value="kg") + + source_flow = Mock(spec=Flow) + source_flow.name = source_name + source_flow.identifier = "source-uuid" + source_flow.context = source_context + source_flow.unit = source_unit + + target_name = Mock() + target_name.__str__ = Mock(return_value="Target Flow") + target_context = Mock() + target_context.export_as_string.return_value = "air" + target_unit = Mock() + target_unit.__str__ = Mock(return_value="kg") + + target_flow = Mock(spec=Flow) + target_flow.name = target_name + target_flow.identifier = "target-uuid" + target_flow.context = target_context + target_flow.unit = target_unit + + match_condition = Mock() + match_condition.as_glad.return_value = "exact" + + match = Mock(spec=Match) + match.source = source_flow + match.target = target_flow + match.condition = match_condition + match.conversion_factor = 1.0 + match.comment = "Test match" + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match] + + result = flowmap.to_glad() + + assert isinstance(result, pd.DataFrame) + assert len(result) == 1 + assert result.iloc[0]["SourceFlowName"] == "Source Flow" + assert result.iloc[0]["TargetFlowName"] == "Target Flow" + + def test_to_glad_includes_all_columns(self): + """Test that to_glad includes all required GLAD columns.""" + source_name = Mock() + source_name.__str__ = Mock(return_value="Source") + source_context = Mock() + source_context.export_as_string.return_value = "air" + source_unit = Mock() + source_unit.__str__ = Mock(return_value="kg") + + source_flow = Mock(spec=Flow) + source_flow.name = source_name + source_flow.identifier = "source-id" + source_flow.context = source_context + source_flow.unit = source_unit + + target_name = Mock() + target_name.__str__ = Mock(return_value="Target") + target_context = Mock() + target_context.export_as_string.return_value = "air" + target_unit = Mock() + target_unit.__str__ = Mock(return_value="kg") + + target_flow = Mock(spec=Flow) + target_flow.name = target_name + target_flow.identifier = "target-id" + target_flow.context = target_context + target_flow.unit = target_unit + + match_condition = Mock() + match_condition.as_glad.return_value = "exact" + + match = Mock(spec=Match) + match.source = source_flow + match.target = target_flow + match.condition = match_condition + match.conversion_factor = 1.0 + match.comment = "Comment" + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match] + + result = flowmap.to_glad() + + expected_columns = [ + "SourceFlowName", + "SourceFlowUUID", + "SourceFlowContext", + "SourceUnit", + "MatchCondition", + "ConversionFactor", + "TargetFlowName", + "TargetFlowUUID", + "TargetFlowContext", + "TargetUnit", + "MemoMapper", + ] + assert all(col in result.columns for col in expected_columns) + + def test_to_glad_ensure_id_replaces_none_with_empty_string(self): + """Test that to_glad replaces None identifiers with empty string when ensure_id=True.""" + source_name = Mock() + source_name.__str__ = Mock(return_value="Source") + source_context = Mock() + source_context.export_as_string.return_value = "air" + source_unit = Mock() + source_unit.__str__ = Mock(return_value="kg") + + source_flow = Mock(spec=Flow) + source_flow.name = source_name + source_flow.identifier = None + source_flow.context = source_context + source_flow.unit = source_unit + + target_name = 
Mock() + target_name.__str__ = Mock(return_value="Target") + target_context = Mock() + target_context.export_as_string.return_value = "air" + target_unit = Mock() + target_unit.__str__ = Mock(return_value="kg") + + target_flow = Mock(spec=Flow) + target_flow.name = target_name + target_flow.identifier = None + target_flow.context = target_context + target_flow.unit = target_unit + + match_condition = Mock() + match_condition.as_glad.return_value = "exact" + + match = Mock(spec=Match) + match.source = source_flow + match.target = target_flow + match.condition = match_condition + match.conversion_factor = 1.0 + match.comment = "Comment" + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match] + + result = flowmap.to_glad(ensure_id=True) + + assert result.iloc[0]["SourceFlowUUID"] == "" + assert result.iloc[0]["TargetFlowUUID"] == "" + + def test_to_glad_ensure_id_false_keeps_none(self): + """Test that to_glad keeps None identifiers when ensure_id=False.""" + source_name = Mock() + source_name.__str__ = Mock(return_value="Source") + source_context = Mock() + source_context.export_as_string.return_value = "air" + source_unit = Mock() + source_unit.__str__ = Mock(return_value="kg") + + source_flow = Mock(spec=Flow) + source_flow.name = source_name + source_flow.identifier = None + source_flow.context = source_context + source_flow.unit = source_unit + + target_name = Mock() + target_name.__str__ = Mock(return_value="Target") + target_context = Mock() + target_context.export_as_string.return_value = "air" + target_unit = Mock() + target_unit.__str__ = Mock(return_value="kg") + + target_flow = Mock(spec=Flow) + target_flow.name = target_name + target_flow.identifier = None + target_flow.context = target_context + target_flow.unit = target_unit + + match_condition = Mock() + match_condition.as_glad.return_value = "exact" + + match = Mock(spec=Match) + match.source = source_flow + match.target = target_flow + match.condition = match_condition + match.conversion_factor = 1.0 + match.comment = "Comment" + + flowmap = Flowmap( + source_flows=[], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [match] + + result = flowmap.to_glad(ensure_id=False) + + assert pd.isna(result.iloc[0]["SourceFlowUUID"]) + assert pd.isna(result.iloc[0]["TargetFlowUUID"]) + + def test_to_glad_missing_source_includes_unmatched(self): + """Test that to_glad includes unmatched source flows when missing_source=True.""" + # Create unmatched source flow + unmatched_name = Mock() + unmatched_name.__str__ = Mock(return_value="Unmatched") + unmatched_context = Mock() + unmatched_context.export_as_string.return_value = "air" + unmatched_unit = Mock() + unmatched_unit.__str__ = Mock(return_value="kg") + + unmatched_original = Mock(spec=Flow) + unmatched_original.name = unmatched_name + unmatched_original.identifier = "unmatched-id" + unmatched_original.context = unmatched_context + unmatched_original.unit = unmatched_unit + + unmatched_flow = Mock(spec=NormalizedFlow) + unmatched_flow.matched = False + unmatched_flow.original = unmatched_original + + flowmap = Flowmap( + source_flows=[unmatched_flow], + target_flows=[], + data_preparation_functions=[], + ) + flowmap.matches = [] + + result = flowmap.to_glad(missing_source=True) + + assert len(result) == 1 + assert result.iloc[0]["SourceFlowName"] == "Unmatched" + # Unmatched flows only have source columns, target columns will be NaN + # The DataFrame will have all columns but target values 
will be NaN
+        if "TargetFlowName" in result.columns:
+            assert pd.isna(result.iloc[0]["TargetFlowName"])
+
+    @patch("flowmapper.flowmap.Path")
+    def test_to_glad_saves_to_excel(self, mock_path_class):
+        """Test that to_glad saves to Excel when path is provided."""
+        import os
+        import tempfile
+        from pathlib import Path as RealPath
+
+        # Use a temporary file that we can actually create
+        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
+            test_path = RealPath(tmp.name)
+
+        try:
+            mock_path_class.return_value = test_path
+
+            source_name = Mock()
+            source_name.__str__ = Mock(return_value="Source")
+            source_context = Mock()
+            source_context.export_as_string.return_value = "air"
+            source_unit = Mock()
+            source_unit.__str__ = Mock(return_value="kg")
+
+            source_flow = Mock(spec=Flow)
+            source_flow.name = source_name
+            source_flow.identifier = "source-id"
+            source_flow.context = source_context
+            source_flow.unit = source_unit
+
+            target_name = Mock()
+            target_name.__str__ = Mock(return_value="Target")
+            target_context = Mock()
+            target_context.export_as_string.return_value = "air"
+            target_unit = Mock()
+            target_unit.__str__ = Mock(return_value="kg")
+
+            target_flow = Mock(spec=Flow)
+            target_flow.name = target_name
+            target_flow.identifier = "target-id"
+            target_flow.context = target_context
+            target_flow.unit = target_unit
+
+            match_condition = Mock()
+            match_condition.as_glad.return_value = "exact"
+
+            match = Mock(spec=Match)
+            match.source = source_flow
+            match.target = target_flow
+            match.condition = match_condition
+            match.conversion_factor = 1.0
+            match.comment = "Comment"
+
+            flowmap = Flowmap(
+                source_flows=[],
+                target_flows=[],
+                data_preparation_functions=[],
+            )
+            flowmap.matches = [match]
+
+            result = flowmap.to_glad(path=test_path)
+
+            # Verify path was converted to Path
+            mock_path_class.assert_called_once_with(test_path)
+
+            # Verify file was created
+            assert test_path.exists()
+        finally:
+            # Clean up
+            if test_path.exists():
+                os.unlink(test_path)
diff --git a/tests/unit/test_oxidation_state.py b/tests/unit/test_oxidation_state.py
new file mode 100644
index 0000000..85cd5b8
--- /dev/null
+++ b/tests/unit/test_oxidation_state.py
@@ -0,0 +1,433 @@
+"""Unit tests for OxidationState class."""
+
+import pytest
+
+from flowmapper.fields import OxidationState
+
+
+class TestOxidationStateInitialization:
+    """Test OxidationState initialization."""
+
+    def test_init_with_positive_value(self):
+        """Test initialization with positive value."""
+        os = OxidationState(3)
+        assert os.value == 3, f"Expected os.value to be 3, but got {os.value}"
+
+    def test_init_with_negative_value(self):
+        """Test initialization with negative value."""
+        os = OxidationState(-2)
+        assert os.value == -2, f"Expected os.value to be -2, but got {os.value}"
+
+    def test_init_with_zero(self):
+        """Test initialization with zero."""
+        os = OxidationState(0)
+        assert os.value == 0, f"Expected os.value to be 0, but got {os.value}"
+
+    def test_init_with_boundary_values(self):
+        """Test initialization with boundary values."""
+        os_min = OxidationState(-5)
+        os_max = OxidationState(9)
+        assert (
+            os_min.value == -5
+        ), f"Expected os_min.value to be -5, but got {os_min.value}"
+        assert (
+            os_max.value == 9
+        ), f"Expected os_max.value to be 9, but got {os_max.value}"
+
+
+class TestOxidationStateEq:
+    """Test OxidationState __eq__ method."""
+
+    def test_eq_with_same_oxidation_state(self):
+        """Test equality with same OxidationState instance."""
+        os1 = OxidationState(3)
+        os2 = OxidationState(3)
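+        # Equality is assumed to be value-based: an OxidationState compares
+        # equal to another instance, or to a plain int, with the same value.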
assert ( + os1 == os2 + ), f"Expected os1 to equal os2, but they are not equal (os1={os1.value}, os2={os2.value})" + + def test_eq_with_different_oxidation_state(self): + """Test equality with different OxidationState.""" + os1 = OxidationState(3) + os2 = OxidationState(4) + assert ( + os1 != os2 + ), f"Expected os1 to not equal os2, but they are equal (os1={os1.value}, os2={os2.value})" + + def test_eq_with_integer(self): + """Test equality with integer.""" + os = OxidationState(3) + assert ( + os == 3 + ), f"Expected os to equal 3, but they are not equal (os={os.value})" + assert ( + os != 4 + ), f"Expected os to not equal 4, but they are equal (os={os.value})" + + def test_eq_with_negative_integer(self): + """Test equality with negative integer.""" + os = OxidationState(-2) + assert ( + os == -2 + ), f"Expected os to equal -2, but they are not equal (os={os.value})" + assert ( + os != -3 + ), f"Expected os to not equal -3, but they are equal (os={os.value})" + + def test_eq_with_zero(self): + """Test equality with zero.""" + os = OxidationState(0) + assert ( + os == 0 + ), f"Expected os to equal 0, but they are not equal (os={os.value})" + assert ( + os != 1 + ), f"Expected os to not equal 1, but they are equal (os={os.value})" + + +class TestOxidationStateHasOxidationState: + """Test OxidationState has_oxidation_state static method.""" + + def test_has_oxidation_state_with_roman_numeral_lowercase(self): + """Test has_oxidation_state with lowercase roman numeral.""" + assert OxidationState.has_oxidation_state( + "chromium (iii)" + ), "Expected has_oxidation_state('chromium (iii)') to return True, but it returned False" + assert OxidationState.has_oxidation_state( + "iron (ii)" + ), "Expected has_oxidation_state('iron (ii)') to return True, but it returned False" + assert OxidationState.has_oxidation_state( + "manganese (vi)" + ), "Expected has_oxidation_state('manganese (vi)') to return True, but it returned False" + + def test_has_oxidation_state_with_roman_numeral_uppercase(self): + """Test has_oxidation_state with uppercase roman numeral.""" + assert OxidationState.has_oxidation_state( + "Iron (II)" + ), "Expected has_oxidation_state('Iron (II)') to return True, but it returned False" + assert OxidationState.has_oxidation_state( + "Chromium (III)" + ), "Expected has_oxidation_state('Chromium (III)') to return True, but it returned False" + assert OxidationState.has_oxidation_state( + "Mercury (IV)" + ), "Expected has_oxidation_state('Mercury (IV)') to return True, but it returned False" + + def test_has_oxidation_state_with_roman_numeral_no_parentheses(self): + """Test has_oxidation_state with roman numeral without parentheses.""" + assert OxidationState.has_oxidation_state( + "chromium iii" + ), "Expected has_oxidation_state('chromium iii') to return True, but it returned False" + assert OxidationState.has_oxidation_state( + "iron II" + ), "Expected has_oxidation_state('iron II') to return True, but it returned False" + + def test_has_oxidation_state_with_number(self): + """Test has_oxidation_state with number.""" + # The new regex requires a sign before the number + assert OxidationState.has_oxidation_state( + "iron (+2)" + ), "Expected has_oxidation_state('iron (+2)') to return True, but it returned False" + assert OxidationState.has_oxidation_state( + "iron (-2)" + ), "Expected has_oxidation_state('iron (-2)') to return True, but it returned False" + # Numbers without signs or with signs after no longer match + assert not OxidationState.has_oxidation_state( + "iron (2)" + ), 
"Expected has_oxidation_state('iron (2)') to return False (no sign), but it returned True" + assert not OxidationState.has_oxidation_state( + "iron (3+)" + ), "Expected has_oxidation_state('iron (3+)') to return False (sign after), but it returned True" + assert not OxidationState.has_oxidation_state( + "iron (2-)" + ), "Expected has_oxidation_state('iron (2-)') to return False (sign after), but it returned True" + + def test_has_oxidation_state_with_number_no_parentheses(self): + """Test has_oxidation_state with number without parentheses.""" + # The new regex requires a sign before the number + assert OxidationState.has_oxidation_state( + "iron +3" + ), "Expected has_oxidation_state('iron +3') to return True, but it returned False" + assert OxidationState.has_oxidation_state( + "iron -2" + ), "Expected has_oxidation_state('iron -2') to return True, but it returned False" + # Numbers without signs or with signs after no longer match + assert not OxidationState.has_oxidation_state( + "iron 2" + ), "Expected has_oxidation_state('iron 2') to return False (no sign), but it returned True" + assert not OxidationState.has_oxidation_state( + "iron 2-" + ), "Expected has_oxidation_state('iron 2-') to return False (sign after), but it returned True" + assert not OxidationState.has_oxidation_state( + "iron 02-" + ), "Expected has_oxidation_state('iron 02-') to return False (sign after), but it returned True" + + def test_has_oxidation_state_without_oxidation_state(self): + """Test has_oxidation_state without oxidation state.""" + assert not OxidationState.has_oxidation_state( + "water" + ), "Expected has_oxidation_state('water') to return False, but it returned True" + assert not OxidationState.has_oxidation_state( + "iron" + ), "Expected has_oxidation_state('iron') to return False, but it returned True" + assert not OxidationState.has_oxidation_state( + "chromium oxide" + ), "Expected has_oxidation_state('chromium oxide') to return False, but it returned True" + + def test_has_oxidation_state_with_compound_identifier(self): + """Test has_oxidation_state should not match numbers in compound identifiers.""" + assert not OxidationState.has_oxidation_state( + "Ethane,, 1,1,2-trichloro-1,2,2-trifluoro-, CFC-113" + ), "Expected has_oxidation_state('Ethane,, 1,1,2-trichloro-1,2,2-trifluoro-, CFC-113') to return False, but it returned True" + + def test_has_oxidation_state_should_not_match_roman_numeral_in_word(self): + """Test has_oxidation_state should not match roman numerals embedded in words.""" + assert not OxidationState.has_oxidation_state( + "Bifenox" + ), "Expected has_oxidation_state('Bifenox') to return False, but it returned True" + + def test_has_oxidation_state_with_comma(self): + """Test has_oxidation_state with comma before oxidation state.""" + assert OxidationState.has_oxidation_state( + "iron, (II)" + ), "Expected has_oxidation_state('iron, (II)') to return True, but it returned False" + # The new regex requires a sign before the number + assert OxidationState.has_oxidation_state( + "iron, (+2)" + ), "Expected has_oxidation_state('iron, (+2)') to return True, but it returned False" + + +class TestOxidationStateFromString: + """Test OxidationState from_string class method.""" + + def test_from_string_with_roman_numeral_lowercase(self): + """Test from_string with lowercase roman numeral.""" + os, remaining = OxidationState.from_string("chromium (iii)") + assert os.value == 3, f"Expected os.value to be 3, but got {os.value}" + assert ( + remaining == "chromium" + ), f"Expected remaining to 
be 'chromium', but got {remaining!r}" + + def test_from_string_with_roman_numeral_uppercase(self): + """Test from_string with uppercase roman numeral.""" + os, remaining = OxidationState.from_string("Iron (II)") + assert os.value == 2, f"Expected os.value to be 2, but got {os.value}" + assert ( + remaining == "Iron" + ), f"Expected remaining to be 'Iron', but got {remaining!r}" + + def test_from_string_with_roman_numeral_no_parentheses(self): + """Test from_string with roman numeral without parentheses.""" + os, remaining = OxidationState.from_string("chromium iii") + assert os.value == 3, f"Expected os.value to be 3, but got {os.value}" + assert ( + remaining == "chromium" + ), f"Expected remaining to be 'chromium', but got {remaining!r}" + + def test_from_string_with_roman_numeral_negative(self): + """Test from_string with negative roman numeral.""" + os, remaining = OxidationState.from_string("iron (II-)") + assert os.value == -2, f"Expected os.value to be -2, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_roman_numeral_positive_sign(self): + """Test from_string with positive sign in roman numeral.""" + os, remaining = OxidationState.from_string("iron (II+)") + assert os.value == 2, f"Expected os.value to be 2, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_number(self): + """Test from_string with number.""" + # The new regex requires a sign before the number + os, remaining = OxidationState.from_string("iron (+2)") + assert os.value == 2, f"Expected os.value to be 2, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_number_positive(self): + """Test from_string with positive number.""" + # The new regex requires a sign before the number + os, remaining = OxidationState.from_string("iron (+3)") + assert os.value == 3, f"Expected os.value to be 3, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_number_negative(self): + """Test from_string with negative number.""" + # The new regex requires a sign before the number + os, remaining = OxidationState.from_string("iron (-2)") + assert os.value == -2, f"Expected os.value to be -2, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_number_no_parentheses(self): + """Test from_string with number without parentheses.""" + # The new regex requires a sign before the number + os, remaining = OxidationState.from_string("iron +2") + assert os.value == 2, f"Expected os.value to be 2, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_number_sign_before(self): + """Test from_string with sign before number.""" + os, remaining = OxidationState.from_string("iron +3") + assert os.value == 3, f"Expected os.value to be 3, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_number_sign_before_negative(self): + """Test from_string with negative sign before number.""" + os, remaining = OxidationState.from_string("iron -2") + assert os.value == -2, f"Expected os.value to be -2, but got {os.value}" + 
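+        # Sketch of the parse these tests pin down (an assumption, not the
+        # actual regex in flowmapper.fields): an optional comma, then either a
+        # roman numeral i-ix with an optional trailing sign, or an integer
+        # whose sign must come first, all optionally parenthesised, e.g.
+        #   r",?\s+\(?\s*(?:(?P<roman>[ivx]+)\s*(?P<rsign>[+-])?|(?P<sign>[+-])(?P<num>\d+))\s*\)?$"
+        # matched with re.IGNORECASE, the value then checked against physical
+        # bounds of -5 to 9.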
assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_comma(self): + """Test from_string with comma before oxidation state.""" + os, remaining = OxidationState.from_string("iron, (II)") + assert os.value == 2, f"Expected os.value to be 2, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_with_comma_and_leading_zeros(self): + """Test from_string with comma and number with leading zeros.""" + os, remaining = OxidationState.from_string("foo, +002") + assert os.value == 2, f"Expected os.value to be 2, but got {os.value}" + assert ( + remaining == "foo" + ), f"Expected remaining to be 'foo', but got {remaining!r}" + + def test_from_string_with_whitespace(self): + """Test from_string with whitespace around oxidation state.""" + os, remaining = OxidationState.from_string("iron ( II )") + assert os.value == 2, f"Expected os.value to be 2, but got {os.value}" + assert ( + remaining == "iron" + ), f"Expected remaining to be 'iron', but got {remaining!r}" + + def test_from_string_raises_error_invalid_roman_numeral(self): + """Test from_string raises error for invalid roman numeral.""" + with pytest.raises(ValueError, match="is not a valid roman numeral"): + OxidationState.from_string("iron (IIII)") + + # Test various invalid roman numerals + invalid_cases = [ + "iron (IIII)", # Four I's in a row + "iron (VV)", # Two V's + "iron (VX)", # Invalid subtraction + ] + for invalid_case in invalid_cases: + with pytest.raises(ValueError, match="is not a valid roman numeral"): + OxidationState.from_string(invalid_case) + + def test_from_string_raises_error_both_signs(self): + """Test from_string raises error when both signs are present.""" + # The new regex only matches signs before the number, so "iron (+2-)" won't match + with pytest.raises(ValueError, match="No match found"): + OxidationState.from_string("iron (+2-)") + + def test_from_string_raises_error_no_match(self): + """Test from_string raises error when no match is found.""" + with pytest.raises(ValueError, match="No match found"): + OxidationState.from_string("iron") + with pytest.raises(ValueError, match="No match found"): + OxidationState.from_string( + "Ethane,, 1,1,2-trichloro-1,2,2-trifluoro-, CFC-113" + ) + with pytest.raises(ValueError, match="No match found"): + OxidationState.from_string("Bifenox") + + def test_from_string_raises_error_too_low(self): + """Test from_string raises error for value too low.""" + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (-6)") + + def test_from_string_raises_error_too_high(self): + """Test from_string raises error for value too high.""" + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (+10)") + + def test_from_string_raises_error_values_outside_bounds_roman(self): + """Test from_string raises error for roman numeral values outside bounds.""" + # Test values too low + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (VI-)") # -6 + + # Test values too high + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (X)") # 10 + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (XI)") # 11 + + def test_from_string_raises_error_values_outside_bounds_numbers(self): + """Test from_string raises error for 
number values outside bounds.""" + # Test values too low + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (-6)") + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (-10)") + + # Test values too high + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (+10)") + with pytest.raises(ValueError, match="outside physical bounds"): + OxidationState.from_string("iron (+15)") + + def test_from_string_boundary_values(self): + """Test from_string with boundary values.""" + os_min, remaining = OxidationState.from_string("iron (-5)") + assert ( + os_min.value == -5 + ), f"Expected os_min.value to be -5, but got {os_min.value}" + + # The new regex requires a sign before the number + os_max, remaining = OxidationState.from_string("iron (+9)") + assert ( + os_max.value == 9 + ), f"Expected os_max.value to be 9, but got {os_max.value}" + + def test_from_string_various_roman_numerals(self): + """Test from_string with various roman numerals.""" + test_cases = [ + ("iron (i)", 1), + ("iron (ii)", 2), + ("iron (iii)", 3), + ("iron (iv)", 4), + ("iron (v)", 5), + ("iron (vi)", 6), + ("iron (vii)", 7), + ("iron (viii)", 8), + ("iron (ix)", 9), + ] + for string, expected_value in test_cases: + os, remaining = OxidationState.from_string(string) + assert ( + os.value == expected_value + ), f"Expected os.value to be {expected_value} for '{string}', but got {os.value}" + + def test_from_string_remaining_string(self): + """Test from_string returns correct remaining string.""" + test_cases = [ + ("chromium (iii)", "chromium"), + ("iron (II)", "iron"), + ("manganese (vi)", "manganese"), + # The new regex requires a sign before the number + ("mercury (+2)", "mercury"), + ("tin (+3)", "tin"), + ("beryllium (-2)", "beryllium"), + ] + for string, expected_remaining in test_cases: + os, remaining = OxidationState.from_string(string) + assert ( + remaining == expected_remaining + ), f"Expected remaining to be {expected_remaining!r} for '{string}', but got {remaining!r}" diff --git a/tests/unit/test_randonneur.py b/tests/unit/test_randonneur.py new file mode 100644 index 0000000..26c8ae5 --- /dev/null +++ b/tests/unit/test_randonneur.py @@ -0,0 +1,283 @@ +"""Unit tests for randonneur-based transformation utilities.""" + +from flowmapper.domain.flow import Flow +from flowmapper.domain.normalized_flow import NormalizedFlow +from flowmapper.utils import apply_transformation_and_convert_flows_to_normalized_flows + + +class TestApplyGenericTransformationsToFlows: + """Test apply_transformation_and_convert_flows_to_normalized_flows function.""" + + def test_basic_transformation_single_function(self): + """Test basic transformation with a single function.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + def transform_func(graph): + # Modify the name in the dict + result = [] + for flow_dict in graph: + modified = flow_dict.copy() + modified["name"] = "Modified name" + result.append(modified) + return result + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_func], flows=[flow] + ) + + assert len(result) == 1, "Expected one NormalizedFlow" + assert isinstance(result[0], NormalizedFlow), "Expected NormalizedFlow object" + assert result[0].original == flow, "Expected original flow to be preserved" + assert ( + result[0].normalized.name.data == "modified name" + ), "Expected normalized name to be 
transformed and normalized" + assert ( + result[0].current.name.data == "modified name" + ), "Expected current to match normalized" + + def test_multiple_transformations_sequential(self): + """Test that multiple transformations are applied sequentially.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + def transform_name(graph): + result = [] + for flow_dict in graph: + modified = flow_dict.copy() + modified["name"] = "First transformation" + result.append(modified) + return result + + def transform_unit(graph): + result = [] + for flow_dict in graph: + modified = flow_dict.copy() + modified["unit"] = "g" + result.append(modified) + return result + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_name, transform_unit], flows=[flow] + ) + + assert len(result) == 1, "Expected one NormalizedFlow" + # Both transformations should be applied + assert ( + result[0].normalized.name.data == "first transformation" + ), "Expected name to be transformed by first function" + assert ( + result[0].original.unit.data == "kg" + ), "Expected original unit to be preserved as `kg`" + assert ( + result[0].normalized.unit.data == "gram" + ), "Expected unit to be transformed by second function and normalized from `g` to `gram`" + + def test_empty_functions_list(self): + """Test with empty list of functions (no transformations).""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[], flows=[flow] + ) + + assert len(result) == 1, "Expected one NormalizedFlow" + assert result[0].original == flow, "Expected original flow to be preserved" + # Without transformations, normalized should be the same as flow.normalize() + expected_normalized = flow.normalize() + assert ( + result[0].normalized.name.data == expected_normalized.name.data + ), "Expected normalized to match flow.normalize()" + + def test_empty_flows_list(self): + """Test with empty list of flows.""" + + def transform_func(graph): + return graph + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_func], flows=[] + ) + + assert len(result) == 0, "Expected empty list" + + def test_multiple_flows(self): + """Test transformation of multiple flows.""" + flow1 = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + flow2 = Flow.from_dict({"name": "Water", "context": "water", "unit": "kg"}) + + def transform_func(graph): + result = [] + for flow_dict in graph: + modified = flow_dict.copy() + modified["name"] = f"Modified {flow_dict['name']}" + result.append(modified) + return result + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_func], flows=[flow1, flow2] + ) + + assert len(result) == 2, "Expected two NormalizedFlow objects" + assert ( + result[0].original == flow1 + ), "Expected first original flow to be preserved" + assert ( + result[1].original == flow2 + ), "Expected second original flow to be preserved" + assert ( + "modified carbon dioxide" in result[0].normalized.name.data.lower() + ), "Expected first flow name to be transformed" + assert ( + "modified water" in result[1].normalized.name.data.lower() + ), "Expected second flow name to be transformed" + + def test_transformation_modifies_context(self): + """Test transformation that modifies context.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", 
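+            # (The pipeline under test dumps flows to dicts like this one,
+            # runs each supplied function over the list in order, then
+            # re-normalises the results into NormalizedFlow objects.)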
"unit": "kg"} + ) + + def transform_context(graph): + result = [] + for flow_dict in graph: + modified = flow_dict.copy() + modified["context"] = ("emissions", "to air") + result.append(modified) + return result + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_context], flows=[flow] + ) + + assert len(result) == 1, "Expected one NormalizedFlow" + # Context should be transformed and normalized + assert isinstance( + result[0].normalized.context.value, tuple + ), "Expected context to be tuple" + assert ( + "emissions" in result[0].normalized.context.value + ), "Expected transformed context to be present" + + def test_transformation_modifies_multiple_fields(self): + """Test transformation that modifies multiple fields at once.""" + flow = Flow.from_dict( + { + "name": "Carbon dioxide", + "context": "air", + "unit": "kg", + "location": "US", + } + ) + + def transform_multiple(graph): + result = [] + for flow_dict in graph: + modified = flow_dict.copy() + modified["name"] = "CO2" + modified["unit"] = "g" + modified["location"] = "CA" + result.append(modified) + return result + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_multiple], flows=[flow] + ) + + assert len(result) == 1, "Expected one NormalizedFlow" + assert ( + result[0].normalized.name.data == "co2" + ), "Expected name to be transformed" + assert ( + result[0].normalized.unit.data == "gram" + ), "Expected unit to be transformed to `g` and normalized to `gram`" + assert ( + result[0].normalized.location == "CA" + ), "Expected location to be transformed" + + def test_original_flows_unchanged(self): + """Test that original Flow objects are not modified.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + original_name = flow.name.data + + def transform_func(graph): + result = [] + for flow_dict in graph: + modified = flow_dict.copy() + modified["name"] = "Modified name" + result.append(modified) + return result + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_func], flows=[flow] + ) + + # Original flow should be unchanged + assert flow.name.data == original_name, "Expected original flow to be unchanged" + assert result[0].original == flow, "Expected original reference to be preserved" + + def test_current_is_copy_of_normalized(self): + """Test that current is a copy of normalized, not a reference.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + def transform_func(graph): + return graph # No transformation + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_func], flows=[flow] + ) + + assert ( + result[0].current is not result[0].normalized + ), "Expected current to be a copy, not a reference" + assert ( + result[0].current.name.data == result[0].normalized.name.data + ), "Expected current to have same data as normalized" + + def test_transformation_chain_preserves_order(self): + """Test that transformations are applied in the correct order.""" + flow = Flow.from_dict( + {"name": "Carbon dioxide", "context": "air", "unit": "kg"} + ) + + call_order = [] + + def transform_first(graph): + call_order.append("first") + result = [] + for flow_dict in graph: + modified = flow_dict.copy() + modified["name"] = "First" + result.append(modified) + return result + + def transform_second(graph): + call_order.append("second") + result = [] + for flow_dict in graph: + modified = 
flow_dict.copy() + modified["name"] = f"{flow_dict['name']} then Second" + result.append(modified) + return result + + result = apply_transformation_and_convert_flows_to_normalized_flows( + functions=[transform_first, transform_second], flows=[flow] + ) + + assert call_order == [ + "first", + "second", + ], "Expected functions to be called in order" + assert ( + "second" in result[0].normalized.name.data.lower() + ), "Expected second transformation to be applied last" diff --git a/tests/unit/test_remove_unit_slash.py b/tests/unit/test_remove_unit_slash.py new file mode 100644 index 0000000..db488c4 --- /dev/null +++ b/tests/unit/test_remove_unit_slash.py @@ -0,0 +1,227 @@ +"""Unit tests for remove_unit_slash function.""" + +from unittest.mock import patch + +from flowmapper.domain.flow import Flow +from flowmapper.utils import remove_unit_slash + + +class TestRemoveUnitSlash: + """Test remove_unit_slash function.""" + + def test_no_match_returns_original_name(self): + """Test that remove_unit_slash returns original name when no match is found.""" + flow = Flow.from_dict({"name": "water", "unit": "kg", "context": "air"}) + + result = remove_unit_slash(flow) + assert result == "water", f"Expected result to be 'water', but got {result!r}" + + def test_match_at_end_removes_slash_and_unit(self): + """Test that remove_unit_slash removes /m3 or /kg when at end of string with whitespace.""" + # Test with /m3 at end with whitespace - unit is captured + flow = Flow.from_dict({"name": "water/m3 ", "unit": "m3", "context": "air"}) + + result = remove_unit_slash(flow) + # match.end() == len(name), so removes from match.start() to end + assert result == "water", f"Expected result to be 'water', but got {result!r}" + + # Test with /kg at end with whitespace + flow = Flow.from_dict({"name": "water/kg ", "unit": "kg", "context": "air"}) + result = remove_unit_slash(flow) + assert result == "water", f"Expected result to be 'water', but got {result!r}" + + def test_match_at_end_with_comma(self): + """Test that remove_unit_slash skips match with only comma after unit at end.""" + flow = Flow.from_dict({"name": "water/m3,", "unit": "m3", "context": "air"}) + + result = remove_unit_slash(flow) + assert ( + result == "water/m3," + ), f"Expected result to be 'water/m3,', but got {result!r}" + + def test_match_in_middle_replaces_with_comma_space(self): + """Test that remove_unit_slash replaces /m3 or /kg in middle with ', '.""" + flow = Flow.from_dict( + {"name": "water/m3, pure", "unit": "m3", "context": "air"} + ) + + result = remove_unit_slash(flow) + assert ( + result == "water, pure" + ), f"Expected result to be 'water, pure', but got {result!r}" + + # Test with /kg + flow = Flow.from_dict( + {"name": "water/kg, pure", "unit": "kg", "context": "air"} + ) + result = remove_unit_slash(flow) + assert ( + result == "water, pure" + ), f"Expected result to be 'water, pure', but got {result!r}" + + def test_match_with_whitespace(self): + """Test that remove_unit_slash handles whitespace after unit.""" + flow = Flow.from_dict({"name": "water/m3 ", "unit": "m3", "context": "air"}) + + result = remove_unit_slash(flow) + # match.end() == len(name) (whitespace is included in match), so removes from start to end + assert result == "water", f"Expected result to be 'water', but got {result!r}" + + def test_match_with_comma_and_whitespace(self): + """Test that remove_unit_slash handles comma and whitespace.""" + flow = Flow.from_dict( + {"name": "water/m3, pure", "unit": "m3", "context": "air"} + ) + + result = 
remove_unit_slash(flow) + assert ( + result == "water, pure" + ), f"Expected result to be 'water, pure', but got {result!r}" + + def test_multiple_matches_skipped(self): + """Test that remove_unit_slash only processes the first match.""" + # With the fixed regex, /kg at the end will match, so it removes /kg + flow = Flow.from_dict({"name": "water/m3/kg", "unit": "kg", "context": "air"}) + + result = remove_unit_slash(flow) + # The regex matches /kg at the end, so it removes /kg + assert ( + result == "water/m3" + ), f"Expected result to be 'water/m3' (removes /kg at end), but got {result!r}" + + def test_no_match_without_slash_and_unit(self): + """Test that remove_unit_slash doesn't match strings without slash and unit.""" + # This was the original bug - "Caesium I" should not match + flow = Flow.from_dict({"name": "Caesium I", "unit": "kg", "context": "air"}) + + result = remove_unit_slash(flow) + # Should not match because there's no /m3 or /kg + assert ( + result == "Caesium I" + ), f"Expected result to be 'Caesium I' (no match), but got {result!r}" + + @patch("flowmapper.utils.flow_names.logger") + def test_incompatible_unit_logs_warning(self, mock_logger): + """Test that remove_unit_slash logs warning for incompatible units.""" + # Create flow with m3 in name but kg as unit (incompatible) + flow = Flow.from_dict({"name": "water/m3 ", "unit": "kg", "context": "air"}) + + # Should still return the modified name + result = remove_unit_slash(flow) + assert result == "water", f"Expected result to be 'water', but got {result!r}" + # Verify warning was called + mock_logger.warning.assert_called_once() + + @patch("flowmapper.utils.flow_names.logger") + def test_incompatible_unit_logs_warning_message(self, mock_logger): + """Test that remove_unit_slash logs the correct warning message for incompatible units.""" + # Create flow with m3 in name but kg as unit (incompatible) + flow = Flow.from_dict({"name": "water/m3 pure", "unit": "kg", "context": "air"}) + + result = remove_unit_slash(flow) + assert ( + result == "water, pure" + ), f"Expected result to be 'water, pure', but got {result!r}" + + # Verify warning was called + mock_logger.warning.assert_called_once() + warning_call = mock_logger.warning.call_args[0][0] + assert ( + "has unit" in warning_call + ), f"Expected warning message to contain 'has unit', but got {warning_call!r}" + assert ( + "but name refers to incompatible unit" in warning_call + ), f"Expected warning message to contain 'but name refers to incompatible unit', but got {warning_call!r}" + assert ( + "m3" in warning_call + ), f"Expected warning message to contain 'm3', but got {warning_call!r}" + + @patch("flowmapper.utils.flow_names.logger") + def test_incompatible_unit_logs_warning_with_kg(self, mock_logger): + """Test that remove_unit_slash logs warning message with kg unit.""" + # Create flow with kg in name but m3 as unit (incompatible) + flow = Flow.from_dict({"name": "water/kg pure", "unit": "m3", "context": "air"}) + + result = remove_unit_slash(flow) + assert ( + result == "water, pure" + ), f"Expected result to be 'water, pure', but got {result!r}" + + # Verify warning was called with kg + mock_logger.warning.assert_called_once() + warning_call = mock_logger.warning.call_args[0][0] + assert ( + "kg" in warning_call + ), f"Expected warning message to contain 'kg', but got {warning_call!r}" + + @patch("flowmapper.utils.flow_names.logger") + def test_compatible_unit_no_warning(self, mock_logger): + """Test that remove_unit_slash doesn't log warning for compatible 
units.""" + # Create flow with m3 in name and m3 as unit (compatible) + flow = Flow.from_dict({"name": "water/m3 ", "unit": "m3", "context": "air"}) + + result = remove_unit_slash(flow) + assert result == "water", f"Expected result to be 'water', but got {result!r}" + # Verify warning was NOT called for compatible units + mock_logger.warning.assert_not_called() + + def test_match_when_unit_not_followed_by_whitespace_or_comma(self): + """Test that remove_unit_slash doesn't match when unit is not followed by whitespace or comma.""" + flow = Flow.from_dict({"name": "water/m3x", "unit": "m3", "context": "air"}) + + result = remove_unit_slash(flow) + # The regex requires whitespace, comma, or end of string after /m3 or /kg + # Since /m3x doesn't match, no change should occur + assert ( + result == "water/m3x" + ), f"Expected result to be 'water/m3x' (no match), but got {result!r}" + + def test_match_not_at_end_replaces(self): + """Test that remove_unit_slash replaces match when not at end.""" + flow = Flow.from_dict({"name": "water/m3 pure", "unit": "m3", "context": "air"}) + + result = remove_unit_slash(flow) + assert ( + result == "water, pure" + ), f"Expected result to be 'water, pure', but got {result!r}" + + def test_case_sensitivity(self): + """Test that remove_unit_slash is case-sensitive for unit pattern.""" + flow = Flow.from_dict( + {"name": "water/M3", "unit": "m3", "context": "air"} + ) # Uppercase M3 + + # Should not match uppercase M3 + result = remove_unit_slash(flow) + assert ( + result == "water/M3" + ), f"Expected result to be 'water/M3' (no match), but got {result!r}" + + def test_no_unit_slash_pattern(self): + """Test that remove_unit_slash doesn't match other slash patterns.""" + flow = Flow.from_dict({"name": "water/liter", "unit": "kg", "context": "air"}) + + result = remove_unit_slash(flow) + assert ( + result == "water/liter" + ), f"Expected result to be 'water/liter' (no match), but got {result!r}" + + def test_empty_name(self): + """Test that remove_unit_slash handles empty name.""" + flow = Flow.from_dict({"name": "", "unit": "kg", "context": "air"}) + + result = remove_unit_slash(flow) + assert result == "", f"Expected result to be '', but got {result!r}" + + def test_name_with_only_unit_slash(self): + """Test that remove_unit_slash handles name with only /m3 or /kg with whitespace.""" + flow = Flow.from_dict({"name": "/m3 ", "unit": "m3", "context": "air"}) + + result = remove_unit_slash(flow) + # match.end() == len(name), so removes from match.start() to end + assert result == "", f"Expected result to be '', but got {result!r}" + + # Test with /kg + flow = Flow.from_dict({"name": "/kg ", "unit": "kg", "context": "air"}) + result = remove_unit_slash(flow) + assert result == "", f"Expected result to be '', but got {result!r}" diff --git a/tests/unit/test_split_location_suffix.py b/tests/unit/test_split_location_suffix.py new file mode 100644 index 0000000..d6e542b --- /dev/null +++ b/tests/unit/test_split_location_suffix.py @@ -0,0 +1,246 @@ +"""Unit tests for split_location_suffix and replace_location_suffix functions.""" + +import pytest + +from flowmapper.errors import MissingLocation +from flowmapper.fields import replace_location_suffix, split_location_suffix + + +class TestSplitLocationSuffix: + """Test split_location_suffix function.""" + + def test_simple_location_code(self): + """Test split_location_suffix with simple location code.""" + name, location = split_location_suffix("Ammonia, NL") + assert name == "Ammonia", f"Expected name to be 'Ammonia', but 
got {name!r}" + assert location == "NL", f"Expected location to be 'NL', but got {location!r}" + + def test_location_code_with_extra_whitespace(self): + """Test split_location_suffix with extra whitespace.""" + name, location = split_location_suffix("Ammonia, \tNL") + assert name == "Ammonia", f"Expected name to be 'Ammonia', but got {name!r}" + assert location == "NL", f"Expected location to be 'NL', but got {location!r}" + + def test_complicated_location_code(self): + """Test split_location_suffix with complicated location code.""" + name, location = split_location_suffix("Ammonia, RER w/o DE+NL+NO") + assert name == "Ammonia", f"Expected name to be 'Ammonia', but got {name!r}" + assert ( + location == "RER w/o DE+NL+NO" + ), f"Expected location to be 'RER w/o DE+NL+NO', but got {location!r}" + + def test_no_location_code(self): + """Test split_location_suffix with no location code.""" + name, location = split_location_suffix("Ammonia") + assert name == "Ammonia", f"Expected name to be 'Ammonia', but got {name!r}" + assert location is None, f"Expected location to be None, but got {location!r}" + + def test_location_code_with_dash(self): + """Test split_location_suffix with location code using dash (should not match).""" + name, location = split_location_suffix("Ammonia-NL") + assert ( + name == "Ammonia-NL" + ), f"Expected name to be 'Ammonia-NL', but got {name!r}" + assert location is None, f"Expected location to be None, but got {location!r}" + + def test_location_code_case_insensitive_fails(self): + """Test that split_location_suffix is case-sensitive and does not match lowercase location codes.""" + name, location = split_location_suffix("Ammonia, nl") + assert ( + name == "Ammonia, nl" + ), f"Expected name to be 'Ammonia, nl', but got {name!r}" + assert location is None, f"Expected location to be None, but got {location!r}" + + def test_multiple_commas(self): + """Test split_location_suffix with multiple commas.""" + name, location = split_location_suffix("Ammonia, pure, NL") + # Should match the last comma followed by location code + assert ( + name == "Ammonia, pure" + ), f"Expected name to be 'Ammonia, pure', but got {name!r}" + assert location == "NL", f"Expected location to be 'NL', but got {location!r}" + + def test_location_code_in_middle(self): + """Test split_location_suffix with location code not at end.""" + name, location = split_location_suffix("Ammonia, NL, pure") + # Should not match because location code is not at the end + assert ( + name == "Ammonia, NL, pure" + ), f"Expected name to be 'Ammonia, NL, pure', but got {name!r}" + assert location is None, f"Expected location to be None, but got {location!r}" + + def test_empty_string(self): + """Test split_location_suffix with empty string.""" + name, location = split_location_suffix("") + assert name == "", f"Expected name to be '', but got {name!r}" + assert location is None, f"Expected location to be None, but got {location!r}" + + def test_only_location_code(self): + """Test split_location_suffix with only location code.""" + name, location = split_location_suffix(", NL") + assert name == "", f"Expected name to be '', but got {name!r}" + assert location == "NL", f"Expected location to be 'NL', but got {location!r}" + + def test_whitespace_before_comma(self): + """Test split_location_suffix with whitespace before comma.""" + name, location = split_location_suffix("Ammonia , NL") + # The regex requires the comma to immediately follow the name, so this does not match + assert ( + name == "Ammonia , NL" + ), f"Expected name to be 'Ammonia 
, NL' (no match), but got {name!r}" + assert location is None, f"Expected location to be None, but got {location!r}" + + def test_no_whitespace_after_comma(self): + """Test split_location_suffix with no whitespace after comma.""" + name, location = split_location_suffix("Ammonia,NL") + # The regex requires whitespace after comma + assert ( + name == "Ammonia,NL" + ), f"Expected name to be 'Ammonia,NL' (no match), but got {name!r}" + assert location is None, f"Expected location to be None, but got {location!r}" + + def test_various_location_codes(self): + """Test split_location_suffix with various location codes.""" + test_cases = [ + ("Water, DE", "Water", "DE"), + ("Water, FR", "Water", "FR"), + ("Water, US", "Water", "US"), + ("Water, GLO", "Water", "GLO"), + ] + for input_str, expected_name, expected_location in test_cases: + name, location = split_location_suffix(input_str) + assert ( + name == expected_name + ), f"Expected name to be {expected_name!r} for '{input_str}', but got {name!r}" + assert ( + location == expected_location + ), f"Expected location to be {expected_location!r} for '{input_str}', but got {location!r}" + + def test_complex_location_with_operators(self): + """Test split_location_suffix with complex location codes containing operators.""" + name, location = split_location_suffix("Ammonia, RER w/o DE+NL+NO") + assert name == "Ammonia", f"Expected name to be 'Ammonia', but got {name!r}" + assert ( + location == "RER w/o DE+NL+NO" + ), f"Expected location to be 'RER w/o DE+NL+NO', but got {location!r}" + + def test_location_code_with_trailing_whitespace(self): + """Test split_location_suffix with trailing whitespace after location.""" + name, location = split_location_suffix("Ammonia, NL ") + assert name == "Ammonia", f"Expected name to be 'Ammonia', but got {name!r}" + assert location == "NL", f"Expected location to be 'NL', but got {location!r}" + + +class TestReplaceLocationSuffix: + """Test replace_location_suffix function.""" + + def test_simple_location_replacement(self): + """Test replace_location_suffix with simple location code.""" + result = replace_location_suffix("Ammonia, NL", "DE") + assert result == "Ammonia, DE", f"Expected 'Ammonia, DE', but got {result!r}" + + def test_location_replacement_with_extra_whitespace(self): + """Test replace_location_suffix with extra whitespace.""" + result = replace_location_suffix("Ammonia, \tNL", "DE") + assert ( + result == "Ammonia, \tDE" + ), f"Expected 'Ammonia, \\tDE', but got {result!r}" + + def test_complicated_location_replacement(self): + """Test replace_location_suffix with complicated location code.""" + result = replace_location_suffix("Ammonia, RER w/o DE+NL+NO", "GLO") + assert result == "Ammonia, GLO", f"Expected 'Ammonia, GLO', but got {result!r}" + + def test_no_location_code_raises_missing_location(self): + """Test replace_location_suffix with no location code (should raise MissingLocation).""" + with pytest.raises(MissingLocation, match="No location suffix found"): + replace_location_suffix("Ammonia", "DE") + + def test_location_code_with_dash_raises_missing_location(self): + """Test replace_location_suffix with location code using dash (should raise MissingLocation).""" + with pytest.raises(MissingLocation, match="No location suffix found"): + replace_location_suffix("Ammonia-NL", "DE") + + def test_location_code_case_insensitive_raises_missing_location(self): + """Test replace_location_suffix with lowercase location (should raise MissingLocation).""" + with pytest.raises(MissingLocation, match="No 
location suffix found"): + replace_location_suffix("Ammonia, nl", "DE") + + def test_multiple_commas_replacement(self): + """Test replace_location_suffix with multiple commas.""" + result = replace_location_suffix("Ammonia, pure, NL", "FR") + # Should replace the last location code + assert ( + result == "Ammonia, pure, FR" + ), f"Expected 'Ammonia, pure, FR', but got {result!r}" + + def test_location_code_in_middle_raises_missing_location(self): + """Test replace_location_suffix with location code not at end (should raise MissingLocation).""" + with pytest.raises(MissingLocation, match="No location suffix found"): + replace_location_suffix("Ammonia, NL, pure", "DE") + + def test_empty_string_raises_missing_location(self): + """Test replace_location_suffix with empty string (should raise MissingLocation).""" + with pytest.raises(MissingLocation, match="No location suffix found"): + replace_location_suffix("", "DE") + + def test_only_location_code_replacement(self): + """Test replace_location_suffix with only location code.""" + result = replace_location_suffix(", NL", "DE") + assert result == ", DE", f"Expected ', DE', but got {result!r}" + + def test_whitespace_before_comma_raises_missing_location(self): + """Test replace_location_suffix with whitespace before comma (should raise MissingLocation).""" + with pytest.raises(MissingLocation, match="No location suffix found"): + replace_location_suffix("Ammonia , NL", "DE") + + def test_no_whitespace_after_comma_raises_missing_location(self): + """Test replace_location_suffix with no whitespace after comma (should raise MissingLocation).""" + with pytest.raises(MissingLocation, match="No location suffix found"): + replace_location_suffix("Ammonia,NL", "DE") + + def test_various_location_codes_replacement(self): + """Test replace_location_suffix with various location codes.""" + test_cases = [ + ("Water, DE", "FR", "Water, FR"), + ("Water, FR", "US", "Water, US"), + ("Water, US", "GLO", "Water, GLO"), + ("Water, GLO", "DE", "Water, DE"), + ] + for input_str, new_location, expected in test_cases: + result = replace_location_suffix(input_str, new_location) + assert ( + result == expected + ), f"Expected {expected!r} for '{input_str}' -> '{new_location}', but got {result!r}" + + def test_complex_location_with_operators_replacement(self): + """Test replace_location_suffix with complex location codes containing operators.""" + result = replace_location_suffix("Ammonia, RER w/o DE+NL+NO", "GLO") + assert result == "Ammonia, GLO", f"Expected 'Ammonia, GLO', but got {result!r}" + + def test_location_code_with_trailing_whitespace_replacement(self): + """Test replace_location_suffix with trailing whitespace after location.""" + result = replace_location_suffix("Ammonia, NL ", "DE") + assert ( + result == "Ammonia, DE " + ), f"Expected 'Ammonia, DE ' (preserving trailing space), but got {result!r}" + + def test_replace_with_empty_string(self): + """Test replace_location_suffix replacing location with empty string.""" + result = replace_location_suffix("Ammonia, NL", "") + assert ( + result == "Ammonia, " + ), f"Expected 'Ammonia, ' (empty location), but got {result!r}" + + def test_replace_with_longer_location(self): + """Test replace_location_suffix replacing with a longer location code.""" + result = replace_location_suffix("Ammonia, NL", "RER w/o DE+NL+NO") + assert ( + result == "Ammonia, RER w/o DE+NL+NO" + ), f"Expected 'Ammonia, RER w/o DE+NL+NO', but got {result!r}" + + def test_replace_with_shorter_location(self): + """Test 
replace_location_suffix replacing with a shorter location code.""" + result = replace_location_suffix("Ammonia, RER w/o DE+NL+NO", "NL") + assert result == "Ammonia, NL", f"Expected 'Ammonia, NL', but got {result!r}" diff --git a/tests/unit/test_string_field.py b/tests/unit/test_string_field.py new file mode 100644 index 0000000..bbf1e96 --- /dev/null +++ b/tests/unit/test_string_field.py @@ -0,0 +1,232 @@ +"""Unit tests for StringField class.""" + +from flowmapper.fields import StringField + + +class TestStringFieldInitialization: + """Test StringField initialization.""" + + def test_init_with_value(self): + """Test initialization with a value.""" + sf = StringField("test") + assert sf == "test", f"Expected sf to equal 'test', but got {sf!r}" + from collections import UserString + + assert isinstance( + sf, UserString + ), f"Expected sf to be an instance of UserString, but got {type(sf)}" + assert not isinstance( + sf, str + ), f"Expected sf to not be an instance of str (UserString is not a subclass), but got {type(sf)}" + + def test_init_with_empty_string(self): + """Test initialization with empty string.""" + sf = StringField("") + # Empty StringField doesn't equal empty string due to __eq__ implementation + assert sf != "", f"Expected sf to not equal '', but they are equal (sf={sf!r})" + assert sf.data == "", f"Expected sf.data to be '', but got {sf.data!r}" + + def test_init_with_whitespace(self): + """Test initialization with whitespace.""" + sf = StringField(" test ") + # Equality normalizes the other string, so " test " becomes "test" + assert sf == " test ", f"Expected sf to equal ' test ', but got {sf!r}" + assert ( + sf.data == " test " + ), f"Expected sf.data to be ' test ', but got {sf.data!r}" + + def test_inherits_from_userstring(self): + """Test that StringField inherits from UserString.""" + sf = StringField("test") + from collections import UserString + + assert isinstance( + sf, UserString + ), f"Expected sf to be an instance of UserString, but got {type(sf)}" + assert issubclass( + StringField, UserString + ), "Expected StringField to be a subclass of UserString, but it is not" + # UserString is not a subclass of str + assert not isinstance( + sf, str + ), f"Expected sf to not be an instance of str (UserString is not a subclass), but got {type(sf)}" + + +class TestStringFieldNormalize: + """Test StringField normalize method.""" + + def test_normalize_with_lowercase_default(self): + """Test normalize with default lowercase=True.""" + sf = StringField("TEST") + normalized = sf.normalize() + assert ( + normalized == "test" + ), f"Expected normalized to equal 'test', but got {normalized!r}" + assert isinstance( + normalized, StringField + ), f"Expected normalized to be a StringField instance, but got {type(normalized)}" + + def test_normalize_with_lowercase_false(self): + """Test normalize with lowercase=False.""" + sf = StringField("TEST") + normalized = sf.normalize(lowercase=False) + assert ( + normalized == "TEST" + ), f"Expected normalized to equal 'TEST', but got {normalized!r}" + + def test_normalize_with_whitespace(self): + """Test normalize with whitespace.""" + sf = StringField(" test ") + normalized = sf.normalize() + assert ( + normalized == "test" + ), f"Expected normalized to equal 'test', but got {normalized!r}" + + def test_normalize_returns_new_instance(self): + """Test that normalize returns a new instance.""" + sf = StringField("TEST") + normalized = sf.normalize() + assert ( + normalized is not sf + ), "Expected normalize() to return a new instance, but 
it returned the same instance" + assert sf == "TEST", f"Expected original sf to remain 'TEST', but got {sf!r}" + + class TestStringFieldEq: + """Test StringField __eq__ method.""" + + def test_eq_with_same_stringfield(self): + """Test equality with same StringField instance.""" + sf1 = StringField("test") + sf2 = StringField("test") + assert ( + sf1 == sf2 + ), f"Expected sf1 to equal sf2, but they are not equal (sf1={sf1!r}, sf2={sf2!r})" + + def test_eq_with_different_stringfield(self): + """Test equality with different StringField.""" + sf1 = StringField("test") + sf2 = StringField("other") + assert ( + sf1 != sf2 + ), f"Expected sf1 to not equal sf2, but they are equal (sf1={sf1!r}, sf2={sf2!r})" + + def test_eq_with_string(self): + """Test equality with string.""" + sf = StringField("test") + assert ( + sf == "test" + ), f"Expected sf to equal 'test', but they are not equal (sf={sf!r})" + assert ( + sf != "other" + ), f"Expected sf to not equal 'other', but they are equal (sf={sf!r})" + + def test_eq_with_empty_stringfield(self): + """Test equality with empty StringField.""" + sf = StringField("") + assert sf != "", f"Expected sf to not equal '', but they are equal (sf={sf!r})" + assert ( + sf != "test" + ), f"Expected sf to not equal 'test', but they are equal (sf={sf!r})" + + def test_eq_with_other_type(self): + """Test equality with non-string, non-StringField type.""" + sf = StringField("test") + assert ( + sf != 123 + ), f"Expected sf to not equal 123, but they are equal (sf={sf!r})" + assert ( + sf != None + ), f"Expected sf to not equal None, but they are equal (sf={sf!r})" + assert sf != [], f"Expected sf to not equal [], but they are equal (sf={sf!r})" + + class TestStringFieldStrBehavior: + """Test StringField string-like behavior (inherited via UserString).""" + + def test_str_operations(self): + """Test that StringField behaves like a string.""" + sf = StringField("test") + assert len(sf) == 4, f"Expected len(sf) to be 4, but got {len(sf)}" + assert ( + sf.upper() == "TEST" + ), f"Expected sf.upper() to be 'TEST', but got {sf.upper()!r}" + assert ( + sf.lower() == "test" + ), f"Expected sf.lower() to be 'test', but got {sf.lower()!r}" + assert sf.startswith( + "te" + ), f"Expected sf.startswith('te') to be True, but got {sf.startswith('te')}" + + def test_bool_with_non_empty_string(self): + """Test truthiness of a non-empty string (via UserString).""" + sf = StringField("test") + assert bool(sf) is True, f"Expected bool(sf) to be True, but got {bool(sf)}" + + def test_bool_with_empty_string(self): + """Test truthiness of an empty string (via UserString).""" + sf = StringField("") + assert bool(sf) is False, f"Expected bool(sf) to be False, but got {bool(sf)}" + + def test_bool_with_whitespace(self): + """Test truthiness of a whitespace-only string (via UserString).""" + sf = StringField(" ") + assert ( + bool(sf) is True + ), f"Expected bool(sf) to be True for whitespace, but got {bool(sf)}" + + class TestStringFieldEdgeCases: + """Test StringField edge cases.""" + + def test_value_preserved_after_normalize(self): + """Test that original value is preserved after normalize.""" + sf = StringField("ORIGINAL") + normalized = sf.normalize() + assert ( + sf == "ORIGINAL" + ), f"Expected original sf to remain 'ORIGINAL', but got {sf!r}" + assert ( + normalized == "original" + ), f"Expected normalized to be 'original', but got {normalized!r}" + + def test_multiple_normalize_calls(self): + """Test that repeated normalize calls are stable.""" + sf = StringField(" TEST ") + norm1 = sf.normalize() +
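# normalize() is expected to be idempotent: a second call must leave the value unchanged +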
norm2 = norm1.normalize() + assert norm1 == "test", f"Expected norm1 to be 'test', but got {norm1!r}" + assert norm2 == "test", f"Expected norm2 to be 'test', but got {norm2!r}" + + def test_equality_chain(self): + """Test equality chain with multiple StringFields.""" + sf1 = StringField("test") + sf2 = StringField("test") + sf3 = StringField("test") + assert ( + sf1 == sf2 == sf3 + ), f"Expected all StringFields to be equal, but they are not (sf1={sf1!r}, sf2={sf2!r}, sf3={sf3!r})" + + def test_normalize_with_different_lowercase_settings(self): + """Test normalize with different lowercase settings.""" + sf = StringField("TEST") + norm1 = sf.normalize(lowercase=True) + norm2 = sf.normalize(lowercase=False) + assert norm1 == "test", f"Expected norm1 to be 'test', but got {norm1!r}" + assert norm2 == "TEST", f"Expected norm2 to be 'TEST', but got {norm2!r}" + + def test_string_concatenation(self): + """Test that StringField can be concatenated like a string.""" + sf1 = StringField("hello") + sf2 = StringField("world") + result = sf1 + " " + sf2 + assert ( + result == "hello world" + ), f"Expected result to be 'hello world', but got {result!r}" + # UserString concatenation returns a new instance of the same class + assert isinstance( + result, StringField + ), f"Expected result to be a StringField instance, but got {type(result)}" + assert ( + result.data == "hello world" + ), f"Expected result.data to be 'hello world', but got {result.data!r}" diff --git a/tests/unit/test_unit.py b/tests/unit/test_unit.py new file mode 100644 index 0000000..8ff4dfa --- /dev/null +++ b/tests/unit/test_unit.py @@ -0,0 +1,271 @@ +import math + +import pytest + +from flowmapper.unit import UnitField + + +def test_equals_mass(): + u1 = UnitField("kg") + u2 = UnitField("kilogram") + + assert ( + u1 == u2 + ), f"Expected u1 to equal u2, but they are not equal (u1={u1!r}, u2={u2!r})" + + +def test_energy(): + u1 = UnitField("kilowatt hour") + u2 = UnitField("MJ") + assert u1.compatible( + u2 + ), f"Expected u1 to be compatible with u2, but they are not (u1={u1!r}, u2={u2!r})" + assert ( + u1.conversion_factor(u2) == 3.6 + ), f"Expected u1.conversion_factor(u2) to be 3.6, but got {u1.conversion_factor(u2)}" + + +def test_enrichment(): + u1 = UnitField("SWU") + u2 = UnitField("tonne * SW") + assert u1.compatible( + u2 + ), f"Expected u1 to be compatible with u2, but they are not (u1={u1!r}, u2={u2!r})" + assert ( + u1.conversion_factor(u2) == 1e-3 + ), f"Expected u1.conversion_factor(u2) to be 1e-3, but got {u1.conversion_factor(u2)}" + + +def test_natural_gas(): + u1 = UnitField("nm3") + u2 = UnitField("sm3") + assert u1.compatible( + u2 + ), f"Expected u1 to be compatible with u2, but they are not (u1={u1!r}, u2={u2!r})" + + +def test_livestock(): + u1 = UnitField("LU") + u2 = UnitField("livestock unit") + assert ( + u1.normalize() == u2.normalize() + ), f"Expected u1 to equal u2, but they are not equal (u1={u1!r}, u2={u2!r})" + + +def test_freight(): + u1 = UnitField("kilogram * km") + u2 = UnitField("tkm") + assert ( + u1.conversion_factor(u2) == 1e-3 + ), f"Expected u1.conversion_factor(u2) to be 1e-3, but got {u1.conversion_factor(u2)}" + + +def test_vehicular_travel(): + u1 = UnitField("vehicle * m") + u2 = UnitField("vkm") + assert ( + u1.conversion_factor(u2) == 1e-3 + ), f"Expected u1.conversion_factor(u2) to be 1e-3, but got {u1.conversion_factor(u2)}" + + +def test_person_travel(): + u1 = UnitField("person * m") + u2 = UnitField("pkm") + assert ( + u1.conversion_factor(u2) == 1e-3 + ), f"Expected 
u1.conversion_factor(u2) to be 1e-3, but got {u1.conversion_factor(u2)}" + + +def test_conversion_factor(): + u1 = UnitField("mg") + u2 = UnitField("kg") + actual = u1.conversion_factor(u2) + assert actual == 1e-06, f"Expected actual to be 1e-06, but got {actual}" + + +def test_nan_conversion_factor(): + u1 = UnitField("bq") + u2 = UnitField("kg") + actual = u1.conversion_factor(u2) + assert math.isnan(actual), f"Expected actual to be NaN, but got {actual}" + + +def test_complex_conversions(): + u1 = UnitField("square_meter_year / t") + u2 = UnitField("(meter ** 2 * month) / kg") + assert ( + u1.conversion_factor(u2) == 0.012 + ), f"Expected u1.conversion_factor(u2) to be 0.012, but got {u1.conversion_factor(u2)}" + + +class TestUnitFieldNormalize: + """Test UnitField normalize method.""" + + def test_normalize_with_valid_unit(self): + """Test normalize with valid unit.""" + u = UnitField("kg") + normalized = u.normalize() + assert ( + normalized == "kilogram" + ), f"Expected normalized to be 'kilogram', but got {normalized!r}" + assert isinstance( + normalized, UnitField + ), f"Expected normalized to be a UnitField instance, but got {type(normalized)}" + + def test_normalize_with_mapped_unit(self): + """Test normalize with unit that needs mapping.""" + # This tests the UNIT_MAPPING functionality + u = UnitField("kilogram") + normalized = u.normalize() + # The unit should be normalized through UNIT_MAPPING if applicable + assert isinstance( + normalized, UnitField + ), f"Expected normalized to be a UnitField instance, but got {type(normalized)}" + + def test_normalize_raises_error_undefined_unit(self): + """Test normalize raises error for undefined unit.""" + u = UnitField("unknown_unit_xyz") + with pytest.raises(ValueError, match="is unknown"): + u.normalize() + + +class TestUnitFieldEq: + """Test UnitField __eq__ method.""" + + def test_eq_with_same_data(self): + """Test equality with same data.""" + u1 = UnitField("kg") + u2 = UnitField("kg") + assert ( + u1 == u2 + ), f"Expected u1 to equal u2, but they are not equal (u1={u1!r}, u2={u2!r})" + + def test_eq_with_different_data_same_unit(self): + """Test equality with different data but same unit (conversion_factor == 1).""" + u1 = UnitField("kg") + u2 = UnitField("kilogram") + assert ( + u1 == u2 + ), f"Expected u1 to equal u2, but they are not equal (u1={u1!r}, u2={u2!r})" + + def test_eq_with_different_units(self): + """Test equality with different units.""" + u1 = UnitField("kg") + u2 = UnitField("g") + assert ( + u1 != u2 + ), f"Expected u1 to not equal u2, but they are equal (u1={u1!r}, u2={u2!r})" + + def test_eq_with_string(self): + """Test equality with string.""" + u = UnitField("kg") + assert u == "kg", f"Expected u to equal 'kg', but they are not equal (u={u!r})" + assert u != "g", f"Expected u to not equal 'g', but they are equal (u={u!r})" + + def test_eq_with_other_type(self): + """Test equality with other types.""" + u = UnitField("kg") + assert u != 123, f"Expected u to not equal 123, but they are equal (u={u!r})" + assert u != None, f"Expected u to not equal None, but they are equal (u={u!r})" + assert u != [], f"Expected u to not equal [], but they are equal (u={u!r})" + + +class TestUnitFieldCompatible: + """Test UnitField compatible method.""" + + def test_compatible_with_compatible_units(self): + """Test compatible with compatible units.""" + u1 = UnitField("kg") + u2 = UnitField("g") + assert u1.compatible( + u2 + ), f"Expected u1 to be compatible with u2, but they are not (u1={u1!r}, u2={u2!r})" + + def 
test_compatible_with_incompatible_units(self): + """Test compatible with incompatible units.""" + u1 = UnitField("kg") + u2 = UnitField("meter") + assert not u1.compatible( + u2 + ), f"Expected u1 to not be compatible with u2, but they are (u1={u1!r}, u2={u2!r})" + + def test_compatible_with_same_unit(self): + """Test compatible with same unit.""" + u1 = UnitField("kg") + u2 = UnitField("kg") + assert u1.compatible( + u2 + ), f"Expected u1 to be compatible with u2, but they are not (u1={u1!r}, u2={u2!r})" + + def test_compatible_with_non_unitfield(self): + """Test compatible with non-UnitField type.""" + u1 = UnitField("kg") + # Strings are now supported and work with compatible() + assert u1.compatible( + "kg" + ), f"Expected u1 to be compatible with 'kg' string (strings are now supported), but it is not (u1={u1!r})" + # Non-string, non-UnitField types should return False + assert not u1.compatible( + 123 + ), f"Expected u1 to not be compatible with 123, but it is (u1={u1!r})" + + class TestUnitFieldConversionFactor: + """Test UnitField conversion_factor method.""" + + def test_conversion_factor_with_same_data(self): + """Test conversion_factor with same data.""" + u1 = UnitField("kg") + u2 = UnitField("kg") + result = u1.conversion_factor(u2) + assert result == 1.0, f"Expected conversion_factor to be 1.0, but got {result}" + + def test_conversion_factor_with_non_unitfield(self): + """Test conversion_factor with non-UnitField type.""" + u1 = UnitField("kg") + # Strings are now supported and work with conversion_factor() + result = u1.conversion_factor("kg") + assert ( + result == 1.0 + ), f"Expected conversion_factor to be 1.0 for same unit string, but got {result}" + # Non-string, non-UnitField types should return NaN + result2 = u1.conversion_factor(123) + assert math.isnan( + result2 + ), f"Expected conversion_factor to be NaN for non-UnitField, non-string type, but got {result2}" + + def test_conversion_factor_with_undefined_unit(self): + """Test conversion_factor with undefined unit.""" + u1 = UnitField("kg") + u2 = UnitField("unknown_unit_xyz") + result = u1.conversion_factor(u2) + assert math.isnan( + result + ), f"Expected conversion_factor to be NaN for undefined unit, but got {result}" + + def test_conversion_factor_with_dimensionality_error(self): + """Test conversion_factor with dimensionality error.""" + u1 = UnitField("kg") + u2 = UnitField("meter") + result = u1.conversion_factor(u2) + assert math.isnan( + result + ), f"Expected conversion_factor to be NaN for incompatible units, but got {result}" + + def test_conversion_factor_less_than_one(self): + """Test a conversion factor smaller than one (mg to kg).""" + u1 = UnitField("mg") + u2 = UnitField("kg") + result = u1.conversion_factor(u2) + assert ( + result == 1e-06 + ), f"Expected conversion_factor to be 1e-06, but got {result}" + + def test_conversion_factor_greater_than_one(self): + """Test a conversion factor greater than one (kg to mg).""" + u1 = UnitField("kg") + u2 = UnitField("mg") + result = u1.conversion_factor(u2) + assert ( + result == 1e06 + ), f"Expected conversion_factor to be 1e06, but got {result}"
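The UnitField tests above pin down a numeric contract: conversion_factor returns 1.0 for equivalent units, an exact scale factor for compatible ones, and NaN for undefined or dimensionally incompatible units. A minimal sketch of that contract, assuming a pint-backed registry (pint, the free function, and its name are illustrative assumptions here, not the actual flowmapper UnitField internals):

import math

import pint

ureg = pint.UnitRegistry()


def conversion_factor(source: str, target: str) -> float:
    """Return f such that 1 source unit equals f target units; NaN if conversion is impossible."""
    try:
        # pint handles parsing, SI prefixes, and dimensional analysis
        return ureg.Quantity(1, source).to(target).magnitude
    except (pint.UndefinedUnitError, pint.DimensionalityError):
        # Mirror the tests: unknown or incompatible units yield NaN, not an exception
        return math.nan


assert conversion_factor("mg", "kg") == 1e-06
assert conversion_factor("kg", "mg") == 1e06
assert math.isnan(conversion_factor("kg", "meter"))  # DimensionalityError
assert math.isnan(conversion_factor("kg", "unknown_unit_xyz"))  # UndefinedUnitError

Returning NaN instead of raising keeps matching pipelines simple: a caller can compute factors for an entire candidate list and filter out the NaN entries afterwards.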