From b7aff0fc2d34b46fcbebcbdd8e53b2206e1c26a4 Mon Sep 17 00:00:00 2001 From: Austin Noto-Moniz Date: Mon, 5 Jan 2026 10:08:14 -0500 Subject: [PATCH] Deprecate CSVDataSource. It has long been supplanted by GemTableDataSource, which is itself growing long in the tooth. --- docs/source/workflows/data_sources.rst | 134 ------------------ .../predictor_evaluation_workflows.rst | 12 +- docs/source/workflows/predictors.rst | 119 ++-------------- src/citrine/__version__.py | 2 +- src/citrine/informatics/data_sources.py | 3 + src/citrine/resources/descriptors.py | 2 +- tests/informatics/test_data_source.py | 37 +++-- 7 files changed, 42 insertions(+), 267 deletions(-) diff --git a/docs/source/workflows/data_sources.rst b/docs/source/workflows/data_sources.rst index fa6165d1e..0ee5b9e42 100644 --- a/docs/source/workflows/data_sources.rst +++ b/docs/source/workflows/data_sources.rst @@ -6,140 +6,6 @@ Data Sources Data sources are used by modules to pull data from outside of the AI engine. For example, a :doc:`predictor ` may need to define external training data. -CSV Data Source ---------------- - -The :class:`~citrine.informatics.data_sources.CSVDataSource` draws data from a CSV file stored on the data platform and annotates it by mapping each column name to a :class:`~citrine.informatics.descriptors.Descriptor`. -The file is referenced via a :class:`~citrine.resources.file_link.FileLink`. -Each FileLink references an explicit version of the CSV file. -Uploading a new file with the same name will produce a new version of the file with a new FileLink. - -The columns in the CSV are extracted and parsed by a mapping of column header names to user-created descriptors. -Columns in the CSV that are not mapped with a descriptor are ignored. - -Assume that a file data.csv exists with the following contents: - -.. 
code:: - - Chemical Formula,Gap,Crystallinity - Bi2Te3,0.153,Single crystalline - Mg2Ge,0.567,Single crystalline - GeTe,0.7,Amorphous - Sb2Se3,1.15,Polycrystalline - -That file could be used as the training data for a predictor as: - -.. code:: python - - from citrine.informatics.data_sources import CSVDataSource - from citrine.informatics.predictors import AutoMLPredictor - from citrine.informatics.descriptors import RealDescriptor, CategoricalDescriptor, ChemicalFormulaDescriptor - - file_link = dataset.files.upload("./data.csv", "bandgap_data.csv") - - data_source = CSVDataSource( - file_link = file_link, - # `column_definitions` maps a column header to a descriptor - # the column header and the descriptor key do not need to be identical - column_definitions = { - "Chemical Formula": ChemicalFormulaDescriptor(key="formula"), - "Gap": RealDescriptor(key="Band gap", lower_bound=0, upper_bound=20, units="eV"), - "Crystallinity": CategoricalDescriptor(key="Crystallinity", categories=[ - "Single crystalline", "Amorphous", "Polycrystalline"]) - } - ) - - predictor = AutoMLPredictor( - name = "Band gap predictor", - description = "Predict the band gap from the chemical formula and crystallinity", - inputs = [ - # referencing `data_source.column_definitions` is one way to ensure that the - # descriptors in the training data match the descriptors in the predictor definition - data_source.column_definitions["Chemical Formula"], - data_source.column_definitions["Crystallinity"] - ], - outputs = [data_source.column_definitions["Gap"]], - training_data = [data_source] - ) - -An optional list of identifiers can be specified. -Identifiers uniquely identify a row and are used in the context of formulations to link from an ingredient to its properties. -Each identifier must correspond to a column header name. -No two rows within this column can contain the same value. 
-If a column should be parsed as data and used as an identifier, identifier columns may overlap with the keys defined in the mapping from column header names to descriptors. - -Identifiers are required in two circumstances. -These circumstances are only relevant if CSV data source represents formulation data. - -1. Ingredient properties are featurized using a :class:`~citrine.informatics.predictors.mean_property_predictor.MeanPropertyPredictor`. - In this case, the link from identifier to row is used to compute mean ingredient property values. -2. Mixtures that contain mixtures are simplified to simple mixtures that contain only leaf ingredients using a :class:`~citrine.informatics.predictors.simple_mixture_predictor.SimpleMixturePredictor`. - In this case, links from each mixture's ingredients to its row (which may also be a mixture) are used to recursively crawl hierarchical blends of blends and construct a recipe that contains only leaf ingredients. - -Note: to build a formulation from a CSV data source an :class:`~citrine.informatics.predictors.ingredients_to_formulation_predictor.IngredientsToFormulationPredictor` must be present in the workflow. -Additionally, each ingredient id used as a key in the predictor's map from ingredient id to its quantity must exist in an identifier column. - -As an example, consider the following saline solution data. 
- -+-------------------+----------------+---------------+---------+ -| Ingredient id | water quantity | salt quantity | density | -+===================+================+===============+=========+ -| hypertonic saline | 0.93 | 0.07 | 1.08 | -+-------------------+----------------+---------------+---------+ -| isotonic saline | 0.99 | 0.01 | 1.01 | -+-------------------+----------------+---------------+---------+ -| water | | | 1.0 | -+-------------------+----------------+---------------+---------+ -| salt | | | 2.16 | -+-------------------+----------------+---------------+---------+ - -Hypertonic and isotonic saline are mixtures formed by mixing water and salt. -Ingredient identifiers are given by the first column. -A CSV data source and an :class:`~citrine.informatics.predictors.ingredients_to_formulation_predictor.IngredientsToFormulationPredictor` -can be configured to construct formulations from tabular data. -This predictor automatically generates an ``output`` formulation descriptor named 'Formulation'. -The following example illustrates this process: - -.. 
code:: python - - from citrine.informatics.data_sources import CSVDataSource - from citrine.informatics.descriptors import FormulationDescriptor, RealDescriptor - from citrine.informatics.predictors import IngredientsToFormulationPredictor - - file_link = dataset.files.upload(file_path="./saline_solutions.csv", dest_name="saline_solutions.csv") - - # create descriptors for each ingredient quantity (volume fraction) - water_quantity = RealDescriptor(key='water quantity', lower_bound=0, upper_bound=1, units="") - salt_quantity = RealDescriptor(key='salt quantity', lower_bound=0, upper_bound=1, units="") - - # create a descriptor to hold density data - density = RealDescriptor(key='density', lower_bound=0, upper_bound=1000, units='g/cc') - - data_source = CSVDataSource( - file_link = file_link, - column_definitions = { - 'water quantity': water_quantity, - 'salt quantity': salt_quantity, - 'density': density - }, - identifiers=['Ingredient id'] - ) - - IngredientsToFormulationPredictor( - name='Ingredients to formulation predictor', - description='Constructs a mixture from ingredient quantities', - # map from ingredient id to its quantity - id_to_quantity={ - 'water': water_quantity, - 'salt': salt_quantity - }, - # label water as a solvent and salt a solute - labels={ - 'solvent': {'water'}, - 'solute': {'salt'} - } - ) - GEM Table Data Source --------------------- diff --git a/docs/source/workflows/predictor_evaluation_workflows.rst b/docs/source/workflows/predictor_evaluation_workflows.rst index ca29fff45..874ffa996 100644 --- a/docs/source/workflows/predictor_evaluation_workflows.rst +++ b/docs/source/workflows/predictor_evaluation_workflows.rst @@ -158,18 +158,14 @@ The predictor we'll evaluate is defined below: .. 
code:: python - from citrine.informatics.data_sources import CSVDataSource + from citrine.informatics.data_sources import GemTableDataSource from citrine.informatics.descriptors import RealDescriptor from citrine.informatics.predictors import AutoMLPredictor + data_source = GemTableDataSource(table_id=training_data_table_uid, table_version=training_data_table_version) + x = RealDescriptor(key='x', lower_bound=0.0, upper_bound=1.0, units='') y = RealDescriptor(key='y', lower_bound=0.0, upper_bound=1.0, units='') - - data_source = CSVDataSource( - file_link=file, # path to CSV that contains training data for x and y - column_definitions={'x': x, 'y': y} - ) - predictor = AutoMLPredictor( name='y predictor', description='predicts y given x', @@ -179,7 +175,7 @@ The predictor we'll evaluate is defined below: ) This predictor expects ``x`` as an input and predicts ``y``. -Training data is provided by a :class:`~citrine.informatics.data_sources.CSVDataSource` that assumes ``filename`` represents the path to a CSV that contains ``x`` and ``y``. +Training data is provided by a :class:`~citrine.informatics.data_sources.GemTableDataSource` that contains ``x`` and ``y``. 
Next, create a project and register the predictor: diff --git a/docs/source/workflows/predictors.rst b/docs/source/workflows/predictors.rst index 8b03129e0..8472e0379 100644 --- a/docs/source/workflows/predictors.rst +++ b/docs/source/workflows/predictors.rst @@ -55,7 +55,7 @@ The following example demonstrates how to use the Citrine Python client to creat description = 'Predictor description', inputs = [input_descriptor_1, input_descriptor_2], outputs = [output_descriptor_1], - training_data = [GemTableDataSource(table_id=training_data_table_uid, table_version=1)] + training_data = [GemTableDataSource(table_id=training_data_table_uid, table_version=training_data_table_version)] ) predictor = create_or_update(collection=project.predictors, @@ -79,26 +79,19 @@ Training data can be specified when creating a graph predictor. This will be combined with any training data present in the sub-predictors. The following example demonstrates how to create a :class:`~citrine.informatics.predictors.graph_predictor.GraphPredictor` from on-platform and locally-defined predictors. -Assume that there exists a CSV file with columns for time, bulk modulus, and Poisson's ratio. +Assume that there exists a GEM Table with columns for time, bulk modulus, and Poisson's ratio. We train ML models to predict bulk modulus and Poisson's ratio, then apply an expression to calculate Young's modulus. The ML models are independently registered on-platform, but the expression predictor is defined locally and hence cannot be re-used. ..
code:: python from citrine.informatics.predictors import GraphPredictor, AutoMLPredictor, ExpressionPredictor - from citrine.informatics.data_sources import CSVDataSource + from citrine.informatics.data_sources import GemTableDataSource time = RealDescriptor("tempering time", lower_bound=0, upper_bound=30, units="s") bulk_modulus = RealDescriptor("Bulk Modulus", lower_bound=0, upper_bound=1E3, units="GPa") poissons_ratio = RealDescriptor("Poisson\'s Ratio", lower_bound=0, upper_bound=0.5, units="") - training_data = CSVDataSource( - file_link = elastic_properties_file, - column_definition = { - "Tempering Time (s)": time, - "Bulk Modulus (GPa)": bulk_modulus, - "Poisson\'s Ratio": poissons_ratio - } - ) + training_data = GemTableDataSource(table_id=training_data_table_uid, table_version=training_data_table_version) bulk_modulus_predictor = project.predictors.register( AutoMLPredictor( @@ -267,7 +260,7 @@ The following example demonstrates how to use a :class:`~citrine.informatics.pre name='Density from solvent molecular structure', description='Predict the density from the solvent molecular structure using molecular structure features.', predictors = [featurizer, ml_predictor], - training_data = [GemTableDataSource(table_id=training_data_table_uid, table_version=1)] # training data shared by all sub-predictors + training_data = [GemTableDataSource(table_id=training_data_table_uid, table_version=training_data_table_version)] # training data shared by all sub-predictors ) # register or update predictor by name @@ -347,7 +340,7 @@ The following example demonstrates how to use a :class:`~citrine.informatics.pre name='Melting temperature from alloy chemical formula', description='Predict the melting temperature from the alloy chemical formula using chemical formula features.', predictors = [featurizer, ml_predictor], - training_data = [GemTableDataSource(table_id=training_data_table_uid, table_version=1)] # training data shared by all sub-predictors + training_data = 
[GemTableDataSource(table_id=training_data_table_uid, table_version=training_data_table_version)] # training data shared by all sub-predictors ) # register or update predictor by name @@ -357,102 +350,6 @@ The following example demonstrates how to use a :class:`~citrine.informatics.pre ) -Ingredients to formulation predictor (ALPHA) --------------------------------------------------- - -The :class:`~citrine.informatics.predictors.ingredients_to_formulation_predictor.IngredientsToFormulationPredictor` constructs a formulation from a list of ingredients. -This predictor is only required to construct formulations from CSV data sources. -Formulations are constructed automatically by GEM Tables when the underlying GEMD data contains formulations, so -an :class:`~citrine.informatics.predictors.ingredients_to_formulation_predictor.IngredientsToFormulationPredictor` is not required in those cases. - -Ingredients are specified by a map from ingredient id to the descriptor that contains the ingredient's quantity. -For example, ``{'water': RealDescriptor('water quantity', lower_bound=0, upper_bound=1, units='')}`` defines an ingredient ``water`` with quantity held by the descriptor ``water quantity``. -There must be a corresponding (id, quantity) pair in the map for all possible ingredients. -If a material does not contain data for a given quantity descriptor key, it is assumed that ingredient is not present in the mixture. 
- -Let's add another ingredient ``salt`` to our map and say we are given data in the form: - -+-------------------+----------------+---------------+----------------+ -| Ingredient id | water quantity | salt quantity | density (g/cc) | -+===================+================+===============+================+ -| hypertonic saline | 0.93 | 0.07 | 1.08 | -+-------------------+----------------+---------------+----------------+ -| isotonic saline | 0.99 | 0.01 | 1.01 | -+-------------------+----------------+---------------+----------------+ -| water | | | 1.0 | -+-------------------+----------------+---------------+----------------+ -| salt | | | 2.16 | -+-------------------+----------------+---------------+----------------+ - -There are two mixtures, hypertonic and isotonic saline, formed by mixing water and salt together in different amounts. -(Note, water and salt are leaf ingredients; hence these rows do not contain quantity data.) -Mixtures are defined by a map from ingredient id to quantity, so this predictor would form 2 mixtures with recipes: - -.. code:: python - - # hypertonic saline - {'water': 0.93, 'salt': 0.07} - - # isotonic saline - {'water': 0.99, 'salt': 0.01} - -Ingredients may be given 0 or more labels. -Labels provide a way to group or distinguish one or more ingredients and can be used to featurize mixtures (discussed in the next section). -The same label may be given to multiple ingredients, and a single ingredient may be given multiple labels. -Labels are specified using a map from each label to a list of all ingredients that should be given that label. -Anytime a recipe contains a non-zero amount of labeled ingredient, the ingredient is assigned the label. -For example, we may wish to label ``water`` as a solvent and ``salt`` as a solute. -These labels are specified via: - -.. 
code:: python - - labels = {"solvent": {"water"}, "solute": {"salt"}} - -The following example illustrates how an -:class:`~citrine.informatics.predictors.ingredients_to_formulation_predictor.IngredientsToFormulationPredictor` -is constructed for the saline example. - -.. code:: python - - from citrine.informatics.descriptors import FormulationDescriptor, RealDescriptor - from citrine.informatics.predictors import IngredientsToFormulationPredictor - - file_link = dataset.files.upload(file_path="./saline_solutions.csv", dest_name"saline_solutions.csv") - - # create descriptors for each ingredient quantity (volume fraction) - water_quantity = RealDescriptor(key='water quantity', lower_bound=0, upper_bound=1, units='') - salt_quantity = RealDescriptor(key='salt quantity', lower_bound=0, upper_bound=1, units='') - - # create a descriptor to hold density data - density = RealDescriptor(key='density', lower_bound=0, upper_bound=1000, units='g/cc') - - data_source = CSVDataSource( - file_link = file_link, - column_definitions = { - 'water quantity': water_quantity, - 'salt quantity': salt_quantity, - 'density': density - }, - identifiers=['Ingredient id'] - ) - - IngredientsToFormulationPredictor( - name='Ingredients to formulation predictor', - description='Constructs a mixture from ingredient quantities', - # map from ingredient id to its quantity - id_to_quantity={ - 'water': water_quantity, - 'salt': salt_quantity - }, - # label water as a solvent and salt a solute - labels={ - "solvent": {"water"}, - "solute": {"salt"} - }, - training_data=[data_source] - ) - - Simple mixture predictor ------------------------ @@ -479,7 +376,7 @@ yielding just the quantities of the leaf constituents salt and water. 
# table with simple mixtures and their ingredients data_source = GemTableDataSource( table_id=table_uid, - table_version=1 + table_version=table_version ) SimpleMixturePredictor( @@ -577,7 +474,7 @@ to compute the mean solute density and the distribution of acetone solubility in # table with formulations and their ingredients data_source = GemTableDataSource( table_id=table_uid, - table_version=1 + table_version=table_version ) mean_property_predictor = MeanPropertyPredictor( diff --git a/src/citrine/__version__.py b/src/citrine/__version__.py index 62bfee6c5..87dca0296 100644 --- a/src/citrine/__version__.py +++ b/src/citrine/__version__.py @@ -1 +1 @@ -__version__ = "3.27.0" +__version__ = "3.28.0" diff --git a/src/citrine/informatics/data_sources.py b/src/citrine/informatics/data_sources.py index 153d9e6ac..77bed62d6 100644 --- a/src/citrine/informatics/data_sources.py +++ b/src/citrine/informatics/data_sources.py @@ -102,6 +102,9 @@ def __init__(self, file_link: FileLink, column_definitions: Mapping[str, Descriptor], identifiers: Optional[List[str]] = None): + warn("CSVDataSource is deprecated as of 3.28.0 and will be removed in 4.0.0. Please use " + "another type of data source, such as GemTableDataSource.", + category=DeprecationWarning) self.file_link = file_link self.column_definitions = column_definitions self.identifiers = identifiers diff --git a/src/citrine/resources/descriptors.py b/src/citrine/resources/descriptors.py index 13c3441e8..d34199ca1 100644 --- a/src/citrine/resources/descriptors.py +++ b/src/citrine/resources/descriptors.py @@ -58,7 +58,7 @@ def from_data_source(self, *, data_source: DataSource) -> List[Descriptor]: Parameters ---------- data_source : DataSource - A CSVDataSource or AraTableDataSource to get descriptors for. + The data source to get descriptors for. 
Returns ------- diff --git a/tests/informatics/test_data_source.py b/tests/informatics/test_data_source.py index 7890c9241..b1b4e2a06 100644 --- a/tests/informatics/test_data_source.py +++ b/tests/informatics/test_data_source.py @@ -13,12 +13,8 @@ from tests.utils.factories import GemTableDataFactory @pytest.fixture(params=[ - CSVDataSource(file_link=FileLink("foo.spam", "http://example.com"), - column_definitions={"spam": RealDescriptor("eggs", lower_bound=0, upper_bound=1.0, units="")}, - identifiers=["identifier"]), GemTableDataSource(table_id=uuid.uuid4(), table_version=1), GemTableDataSource(table_id=uuid.uuid4(), table_version="2"), - GemTableDataSource(table_id=uuid.uuid4(), table_version="2"), ExperimentDataSourceRef(datasource_id=uuid.uuid4()), SnapshotDataSource(snapshot_id=uuid.uuid4()) ]) @@ -47,14 +43,7 @@ def test_invalid_deser(): def test_data_source_id(data_source): - if isinstance(data_source, CSVDataSource): - # TODO: There's no obvious way to recover the column_definitions & identifiers from the ID - with pytest.warns(UserWarning): - transformed = DataSource.from_data_source_id(data_source.to_data_source_id()) - assert isinstance(data_source, CSVDataSource) - assert transformed.file_link == data_source.file_link - else: - assert data_source == DataSource.from_data_source_id(data_source.to_data_source_id()) + assert data_source == DataSource.from_data_source_id(data_source.to_data_source_id()) def test_from_gem_table(): table = GemTable.build(GemTableDataFactory()) @@ -65,3 +54,27 @@ def test_from_gem_table(): def test_invalid_data_source_id(): with pytest.raises(ValueError): DataSource.from_data_source_id(f"Undefined::{uuid.uuid4()}") + + +def test_deser_from_parent_deprecated(): + with pytest.deprecated_call(): + data_source = CSVDataSource(file_link=FileLink("foo.spam", "http://example.com"), + column_definitions={"spam": RealDescriptor("eggs", lower_bound=0, upper_bound=1.0, units="")}, + identifiers=["identifier"]) + + # Serialize and 
deserialize the data source, making sure it is round-trip serializable + data = data_source.dump() + data_source_deserialized = DataSource.build(data) + assert data_source == data_source_deserialized + +def test_data_source_id_deprecated(): + with pytest.deprecated_call(): + data_source = CSVDataSource(file_link=FileLink("foo.spam", "http://example.com"), + column_definitions={"spam": RealDescriptor("eggs", lower_bound=0, upper_bound=1.0, units="")}, + identifiers=["identifier"]) + + # TODO: There's no obvious way to recover the column_definitions & identifiers from the ID + with pytest.deprecated_call(): + with pytest.warns(UserWarning): + transformed = DataSource.from_data_source_id(data_source.to_data_source_id()) + assert transformed.file_link == data_source.file_link