From eaf99a3bd5f8fe5884927a48526cc6aaec46ca8d Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Fri, 20 Jun 2025 11:26:06 -0700 Subject: [PATCH 01/26] (after rebase) updating dependencies after upgrading... pip freeze > requirements.txt the old requirements are saved in oldrequirements.txt for reference --- oldrequirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 oldrequirements.txt diff --git a/oldrequirements.txt b/oldrequirements.txt new file mode 100644 index 0000000..495f0a2 --- /dev/null +++ b/oldrequirements.txt @@ -0,0 +1,8 @@ +xmltodict==0.12.0 +urllib3==1.25.9 +dicttoxml==1.7.4 +tqdm==4.46.0 +bibtexparser==1.2.0 +monty==3.0.2 +nbformat==5.0.7 +nbconvert==5.6.1 From 6b96d338c11e70ef6b17da3eef05ff48f6c9cd8a Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Fri, 20 Jun 2025 18:42:41 -0700 Subject: [PATCH 02/26] Trying to get test_doi_builder to work - deserialization works - re-serialization does not, there are additional dictionary keys that exist that don't match the provided config_file. will need to discuss the goal of this doi_builder a bit more to understand it --- legacy/files/config-example.json | 10 +--------- legacy/mpcite/doi_builder.py | 2 +- legacy/mpcite/models.py | 8 ++++++++ legacy/tests/test_doi_builder.py | 18 +++++++++--------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/legacy/files/config-example.json b/legacy/files/config-example.json index 8719649..a18b14b 100644 --- a/legacy/files/config-example.json +++ b/legacy/files/config-example.json @@ -45,14 +45,6 @@ "username": "", "password": "" }, - "elsevier": { - "endpoint": "https://push-feature.datasearch.elsevier.com/container", - "username": "", - "password": "" - }, "max_doi_requests": 0, - "sync": false, - "@module": "mpcite.doi_builder", - "@class": "DoiBuilder", - "@version": null + "sync": false } diff --git a/legacy/mpcite/doi_builder.py b/legacy/mpcite/doi_builder.py index e3e6708..c42e45d 100644 --- a/legacy/mpcite/doi_builder.py +++ b/legacy/mpcite/doi_builder.py @@ -218,7 +218,7 @@ def from_dict(cls, d: dict): json.dumps(d["robocrys_collection"]), cls=MontyDecoder ) doi_store = json.loads(json.dumps(d["dois_collection"]), cls=MontyDecoder) - report_emails = d["report_emails"] + report_emails = d["report_emails"] if "report_emails" in d else None max_doi_requests = d["max_doi_requests"] sync = d["sync"] diff --git a/legacy/mpcite/models.py b/legacy/mpcite/models.py index b2fab65..1e989e8 100644 --- a/legacy/mpcite/models.py +++ b/legacy/mpcite/models.py @@ -317,3 +317,11 @@ class ExplorerGetJSONResponseModel(BaseModel): sponsor_orgs: List[str] research_orgs: List[str] links: List[Dict[str, str]] + +# Added to resolve failed import in test_doi_builder.py +class MongoConnectionModel(ConnectionModel): + database: str = Field(..., title="MongoDB Database Name") + collection: str = Field(..., title="MongoDB Collection Name") + + def get_connection_string(self) -> str: + return f"mongodb://{self.username}:{self.password}@{self.endpoint}/{self.database}" \ No newline at end of file diff --git a/legacy/tests/test_doi_builder.py b/legacy/tests/test_doi_builder.py index 00a1fab..8d17885 100644 --- a/legacy/tests/test_doi_builder.py +++ b/legacy/tests/test_doi_builder.py @@ -2,13 +2,12 @@ import os import pytest import json -from mpcite.doi_builder import DoiBuilder -from mpcite.models import OSTIModel, MongoConnectionModel, ConnectionModel - +from mpcite.doi_builder import DOIBuilder +from mpcite.models import OSTIDOIRecordModel, MongoConnectionModel, ConnectionModel @pytest.fixture def config_file_path(): - return Path(os.getcwd()) / "files" / "config_test.json" + return Path(os.getcwd()) / "files" / "config-example.json" def test_builder_serialization(config_file_path: Path): @@ -17,17 +16,18 @@ def test_builder_serialization(config_file_path: Path): # test deserialize d: dict = json.load(config_file) try: - doi_builder = DoiBuilder.from_dict(d=d) + doi_builder = DOIBuilder.from_dict(d=d) except Exception as e: assert False, f"Unable to build DOI Builder from config file. Error: {e}" # test serialization - new_d = doi_builder.as_dict() + print(new_d.keys()) + print(d.keys()) assert new_d.keys() == d.keys() - new_osti = OSTIModel.parse_obj(new_d["osti"]) + new_osti = OSTIDOIRecordModel.parse_obj(new_d["osti"]) new_elsevier = ConnectionModel.parse_obj(new_d["elsevier"]) new_materials_connection = MongoConnectionModel.parse_obj( new_d["materials_collection"] @@ -39,7 +39,7 @@ def test_builder_serialization(config_file_path: Path): new_d["robocrys_collection"] ) - true_osti = OSTIModel.parse_obj(d["osti"]) + true_osti = OSTIDOIRecordModel.parse_obj(d["osti"]) true_elsevier = ConnectionModel.parse_obj(new_d["elsevier"]) true_materials_connection = MongoConnectionModel.parse_obj( new_d["materials_collection"] @@ -59,4 +59,4 @@ def test_builder_serialization(config_file_path: Path): assert ( new_dois_collection_connection.dict() == true_dois_collection_connection.dict() ) - assert new_robocrys_collection_connection == true_robocrys_collection_connection + assert new_robocrys_collection_connection == true_robocrys_collection_connection \ No newline at end of file From 46a688d45ac5ab499c32e7901b8df21e5c69f24e Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Mon, 23 Jun 2025 13:34:12 -0700 Subject: [PATCH 03/26] Revert "Trying to get test_doi_builder to work" This reverts commit afa51f76eece8a2650bafbd2b9e2b67199f231d7. --- legacy/files/config-example.json | 10 +++++++++- legacy/mpcite/doi_builder.py | 2 +- legacy/mpcite/models.py | 8 -------- legacy/tests/test_doi_builder.py | 18 +++++++++--------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/legacy/files/config-example.json b/legacy/files/config-example.json index a18b14b..8719649 100644 --- a/legacy/files/config-example.json +++ b/legacy/files/config-example.json @@ -45,6 +45,14 @@ "username": "", "password": "" }, + "elsevier": { + "endpoint": "https://push-feature.datasearch.elsevier.com/container", + "username": "", + "password": "" + }, "max_doi_requests": 0, - "sync": false + "sync": false, + "@module": "mpcite.doi_builder", + "@class": "DoiBuilder", + "@version": null } diff --git a/legacy/mpcite/doi_builder.py b/legacy/mpcite/doi_builder.py index c42e45d..e3e6708 100644 --- a/legacy/mpcite/doi_builder.py +++ b/legacy/mpcite/doi_builder.py @@ -218,7 +218,7 @@ def from_dict(cls, d: dict): json.dumps(d["robocrys_collection"]), cls=MontyDecoder ) doi_store = json.loads(json.dumps(d["dois_collection"]), cls=MontyDecoder) - report_emails = d["report_emails"] if "report_emails" in d else None + report_emails = d["report_emails"] max_doi_requests = d["max_doi_requests"] sync = d["sync"] diff --git a/legacy/mpcite/models.py b/legacy/mpcite/models.py index 1e989e8..b2fab65 100644 --- a/legacy/mpcite/models.py +++ b/legacy/mpcite/models.py @@ -317,11 +317,3 @@ class ExplorerGetJSONResponseModel(BaseModel): sponsor_orgs: List[str] research_orgs: List[str] links: List[Dict[str, str]] - -# Added to resolve failed import in test_doi_builder.py -class MongoConnectionModel(ConnectionModel): - database: str = Field(..., title="MongoDB Database Name") - collection: str = Field(..., title="MongoDB Collection Name") - - def get_connection_string(self) -> str: - return f"mongodb://{self.username}:{self.password}@{self.endpoint}/{self.database}" \ No newline at end of file diff --git a/legacy/tests/test_doi_builder.py b/legacy/tests/test_doi_builder.py index 8d17885..00a1fab 100644 --- a/legacy/tests/test_doi_builder.py +++ b/legacy/tests/test_doi_builder.py @@ -2,12 +2,13 @@ import os import pytest import json -from mpcite.doi_builder import DOIBuilder -from mpcite.models import OSTIDOIRecordModel, MongoConnectionModel, ConnectionModel +from mpcite.doi_builder import DoiBuilder +from mpcite.models import OSTIModel, MongoConnectionModel, ConnectionModel + @pytest.fixture def config_file_path(): - return Path(os.getcwd()) / "files" / "config-example.json" + return Path(os.getcwd()) / "files" / "config_test.json" def test_builder_serialization(config_file_path: Path): @@ -16,18 +17,17 @@ def test_builder_serialization(config_file_path: Path): # test deserialize d: dict = json.load(config_file) try: - doi_builder = DOIBuilder.from_dict(d=d) + doi_builder = DoiBuilder.from_dict(d=d) except Exception as e: assert False, f"Unable to build DOI Builder from config file. Error: {e}" # test serialization + new_d = doi_builder.as_dict() - print(new_d.keys()) - print(d.keys()) assert new_d.keys() == d.keys() - new_osti = OSTIDOIRecordModel.parse_obj(new_d["osti"]) + new_osti = OSTIModel.parse_obj(new_d["osti"]) new_elsevier = ConnectionModel.parse_obj(new_d["elsevier"]) new_materials_connection = MongoConnectionModel.parse_obj( new_d["materials_collection"] @@ -39,7 +39,7 @@ def test_builder_serialization(config_file_path: Path): new_d["robocrys_collection"] ) - true_osti = OSTIDOIRecordModel.parse_obj(d["osti"]) + true_osti = OSTIModel.parse_obj(d["osti"]) true_elsevier = ConnectionModel.parse_obj(new_d["elsevier"]) true_materials_connection = MongoConnectionModel.parse_obj( new_d["materials_collection"] @@ -59,4 +59,4 @@ def test_builder_serialization(config_file_path: Path): assert ( new_dois_collection_connection.dict() == true_dois_collection_connection.dict() ) - assert new_robocrys_collection_connection == true_robocrys_collection_connection \ No newline at end of file + assert new_robocrys_collection_connection == true_robocrys_collection_connection From 588c54ecd5068f2d5fa8ba16ed1317ea625ddd2a Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Tue, 24 Jun 2025 17:09:32 -0700 Subject: [PATCH 04/26] (after rebase) preliminary version of ELinkGetResponseModel made, using ELinkAPI RecordResponse --- .gitignore | 2 + legacy/mpcite/models.py | 302 ++++++---------------------------------- tests/test_elink_api.py | 59 ++++++++ 3 files changed, 107 insertions(+), 256 deletions(-) create mode 100644 tests/test_elink_api.py diff --git a/.gitignore b/.gitignore index 8241d4c..8ac3d15 100644 --- a/.gitignore +++ b/.gitignore @@ -209,6 +209,8 @@ __marimo__/ # Streamlit .streamlit/secrets.toml +*.json +.env json_pages/ notebooks/ diff --git a/legacy/mpcite/models.py b/legacy/mpcite/models.py index b2fab65..e4d055f 100644 --- a/legacy/mpcite/models.py +++ b/legacy/mpcite/models.py @@ -1,101 +1,65 @@ -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, ConfigDict from typing import List, Dict, Optional -from datetime import datetime +import datetime from enum import Enum import bibtexparser - - -class ConnectionModel(BaseModel): - endpoint: str = Field(..., title="URL Endpoint of the connection") - username: str = Field(..., title="User Name") - password: str = Field(..., title="Password") - - -class RoboCrysModel(BaseModel): - material_id: str - last_updated: datetime - description: Optional[str] = None - error: Optional[str] = None - - @classmethod - def get_default_description(cls): - return ( - "Computed materials data using density " - "functional theory calculations. These calculations determine " - "the electronic structure of bulk materials by solving " - "approximations to the Schrodinger equation. For more " - "information, see https://materialsproject.org/docs/calculations" - ) - - -class MaterialModel(BaseModel): - last_updated: datetime = Field( - None, title="timestamp for the most recent calculation" - ) - updated_at: datetime = Field(None, title="alternative to last_updated") - created_at: datetime = Field( - None, - description="creation time for this material defined by when the first structure " - "optimization calculation was run", - ) - task_id: str = Field( - "", title="task id for this material. Also called the material id" - ) - # pretty_formula: str = Field(..., title="clean representation of the formula") - pretty_formula: str = Field(..., title="clean representation of the formula") - chemsys: str - +from elinkapi import Elink, Record +from elinkapi.record import RecordResponse, AccessLimitation, JournalType +from elinkapi.geolocation import Geolocation +from elinkapi.identifier import Identifier +from elinkapi.related_identifier import RelatedIdentifier +from elinkapi.person import Person +from elinkapi.organization import Organization + +class TestClass(RecordResponse): + ... + # stuff class ELinkGetResponseModel(BaseModel): - osti_id: Optional[str] = Field(...) + osti_id: Optional[int] = Field(...) dataset_type: str = Field(default="SM") title: str = Field(...) - creators: str = Field(default="Kristin Persson") # replace with authors + persons: List[Person] contributors: List[Dict[str, str]] = Field( default=[{"first_name": "Materials", "last_name": "Project"}], description="List of Dict of first name, last name mapping", ) # no contributor - product_nos: str = Field(..., title="MP id") - accession_num: str = Field(..., title="MP id") - contract_nos: str = Field("AC02-05CH11231; EDCBEE") - originating_research_org: str = Field( - default="Lawrence Berkeley National Laboratory (LBNL), Berkeley, CA (United States)" - ) - publication_date: str = Field(...) - language: str = Field(default="English") - country: str = Field(default="US") - sponsor_org: str = Field( - default="USDOE Office of Science (SC), Basic Energy Sciences (BES) (SC-22)" - ) + publication_date: datetime.date site_url: str = Field(...) - contact_name: str = Field(default="Kristin Persson") - contact_org: str = Field(default="LBNL") - contact_email: str = Field(default="feedback@materialsproject.org") - contact_phone: str = Field(default="+1(510)486-7218") - related_resource: str = Field("https://materialsproject.org/citing") - contributor_organizations: str = Field(default="MIT; UC Berkeley; Duke; U Louvain") - subject_categories_code: str = Field(default="36 MATERIALS SCIENCE") - keywords: str = Field(...) - description: str = Field(default="") doi: dict = Field( {}, title="DOI info", description="Mainly used during GET request" ) + mp_id: str | None = None + keywords: List[str] = None @classmethod - def get_title(cls, material: MaterialModel): - formula = material.pretty_formula + def from_elinkapi_record(cls, R): + gotResponse = ELinkGetResponseModel( + osti_id = R.osti_id, + title = R.title, + persons = R.persons, + # assume default contributors for now, creators vs contributors? + publication_date = R.publication_date, + site_url = R.site_url, + doi = {"doi": R.doi}, + mp_id = next((id.value for id in R.identifiers if id.type == 'RN'), None), + keywords = R.keywords + ) + + return gotResponse + + def get_title(self): + formula = self.keywords[1] return "Materials Data on %s by Materials Project" % formula - @classmethod - def get_site_url(cls, mp_id): - return "https://materialsproject.org/materials/%s" % mp_id + def get_site_url(self): + return "https://materialsproject.org/materials/%s" % self.mp_id - @classmethod - def get_keywords(cls, material): - keywords = "; ".join( - ["crystal structure", material.pretty_formula, material.chemsys] - ) - return keywords + def get_keywords(self): + # keywords = "; ".join( + # ["crystal structure", material.pretty_formula, material.chemsys] + # ) + return self.keywords @classmethod def get_default_description(cls): @@ -113,11 +77,11 @@ def custom_to_dict(cls, elink_record) -> dict: return elink_record.dict(exclude={"osti_id", "doi"}) else: return elink_record.dict(exclude={"doi"}) - + class ElinkResponseStatusEnum(Enum): - SUCCESS = "SUCCESS" - FAILED = "FAILURE" + SUCCESS = "SUCCESS" + FAILED = "FAILURE" class ELinkPostResponseModel(BaseModel): @@ -142,178 +106,4 @@ def generate_doi_record(self): ) doi_collection_record.set_status(status=self.doi["@status"]) doi_collection_record.last_validated_on = datetime.now() - return doi_collection_record - - -class DOIRecordStatusEnum(str, Enum): - COMPLETED = "COMPLETED" - PENDING = "PENDING" - FAILURE = "FAILURE" - INIT = "INIT" - - -class DOIRecordModel(BaseModel): - material_id: str = Field(...) - doi: str = Field(default="") - bibtex: Optional[str] = None - status: DOIRecordStatusEnum - valid: bool = Field(False) - last_updated: datetime = Field( - default=datetime.now(), - title="DOI last updated time.", - description="Last updated is defined as either a Bibtex or status change.", - ) - created_at: datetime = Field( - default=datetime.now(), - title="DOI Created At", - description="creation time for this DOI record", - ) - last_validated_on: datetime = Field( - default=datetime.now(), - title="Date Last Validated", - description="Date that this data is last validated, " "not necessarily updated", - ) - elsevier_updated_on: datetime = Field( - default=datetime.now(), - title="Date Elsevier is updated", - description="If None, means never uploaded to elsevier", - ) - error: Optional[str] = Field( - default=None, description="None if no error, else error message" - ) - - class Config: - use_enum_values = True - - def set_status(self, status): - self.status = status - - def get_osti_id(self): - if self.doi is None or self.doi == "": - return "" - else: - return self.doi.split("/")[-1] - - def get_bibtex_abstract(self): - try: - if self.bibtex is None: - return "" - bib_db: bibtexparser.bibdatabase.BibDatabase = bibtexparser.loads( - self.bibtex - ) - if bib_db.entries: - return bib_db.entries[0]["abstractnote"] - except Exception as e: - print(e) - return "" - - -class OSTIDOIRecordModel(DOIRecordModel): - material_id: str = Field(...) - doi: str = Field(default="") - bibtex: Optional[str] = None - valid: bool = Field(False) - last_updated: datetime = Field( - default=datetime.now(), - title="DOI last updated time.", - description="Last updated is defined as either a Bibtex or status change.", - ) - - -class ElsevierPOSTContainerModel(BaseModel): - identifier: str = Field(default="", title="mp_id") - source: str = "MATERIALS_PROJECT" - date: str = datetime.now().date().isoformat().__str__() - title: str - description: str = "" - doi: str - authors: List[str] = ["Kristin Persson"] - url: str - type: str = "dataset" - dateAvailable: str = datetime.now().date().isoformat().__str__() - dateCreated: str = datetime.now().date().isoformat().__str__() - version: str = "1.0.0" - funding: str = "USDOE Office of Science (SC), Basic Energy Sciences (BES) (SC-22)" - language: str = "en" - method: str = "Materials Project" - accessRights: str = "Public" - contact: str = "Kristin Persson " - dataStandard: str = "https://materialsproject.org/citing" - howToCite: str = "https://materialsproject.org/citing" - subjectAreas: List[str] = ["36 MATERIALS SCIENCE"] - keywords: List[str] - institutions: List[str] = ["Lawrence Berkeley National Laboratory"] - institutionIds: List[str] = ["AC02-05CH11231; EDCBEE"] - spatialCoverage: List[str] = [] - temporalCoverage: List[str] = [] - references: List[str] = ["https://materialsproject.org/citing"] - relatedResources: List[str] = ["https://materialsproject.org/citing"] - location: str = "1 Cyclotron Rd, Berkeley, CA 94720" - childContainerIds: List[str] = [] - - @classmethod - def get_url(cls, mp_id): - return "https://materialsproject.org/materials/%s" % mp_id - - @classmethod - def get_keywords(cls, material: MaterialModel): - return ["crystal structure", material.pretty_formula, material.chemsys] - - @classmethod - def get_default_description(cls): - return ( - "Computed materials data using density " - "functional theory calculations. These calculations determine " - "the electronic structure of bulk materials by solving " - "approximations to the Schrodinger equation. For more " - "information, see https://materialsproject.org/docs/calculations" - ) - - @classmethod - def get_date_created(cls, material: MaterialModel) -> str: - return material.created_at.date().__str__() - - @classmethod - def get_date_available(cls, material: MaterialModel) -> str: - return material.created_at.date().__str__() - - @classmethod - def get_title(cls, material: MaterialModel) -> str: - return material.pretty_formula - - @classmethod - def from_material_model(cls, material: MaterialModel, doi: str, description: str): - model = ElsevierPOSTContainerModel( - identifier=material.task_id, - title=material.pretty_formula, - doi=doi, - url="https://materialsproject.org/materials/%s" % material.task_id, - keywords=["crystal structure", material.pretty_formula, material.chemsys], - date=datetime.now().date().__str__(), - dateCreated=material.created_at.date().__str__(), - dateAvailable=ElsevierPOSTContainerModel.get_date_available(material), - description=description, - ) - return model - - -class ExplorerGetJSONResponseModel(BaseModel): - osti_id: str - title: str - report_number: str - doi: str - product_type: str - language: str - country_publication: str - description: str - site_ownership_code: str - publication_date: str - entry_date: str - contributing_organizations: str - authors: List[str] - subjects: List[str] - contributing_org: str - doe_contract_number: str - sponsor_orgs: List[str] - research_orgs: List[str] - links: List[Dict[str, str]] + return doi_collection_record \ No newline at end of file diff --git a/tests/test_elink_api.py b/tests/test_elink_api.py new file mode 100644 index 0000000..d1ade7d --- /dev/null +++ b/tests/test_elink_api.py @@ -0,0 +1,59 @@ +import os +from dotenv import load_dotenv + +from elinkapi import Elink, Record, exceptions +import pytest +from mpcite.models import ELinkGetResponseModel, TestClass + +from pymongo import MongoClient + + +load_dotenv() + +atlas_user = os.environ.get("atlas_user") +atlas_password = os.environ.get("atlas_password") +atlas_host = os.environ.get("atlas_host") +mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" + +api = Elink(token=os.environ.get("elink_api_key")) # target default is production E-link service. + +record = api.get_single_record(1190959) +type(record) + +ELinkGotRecordModel = ELinkGetResponseModel.from_elinkapi_record(record) + +print(ELinkGotRecordModel.get_title()) +print(ELinkGotRecordModel.get_site_url()) +print(ELinkGotRecordModel.get_keywords()) +print(ELinkGotRecordModel.get_default_description()) + + + +ELinkTestGetRecordModel = TestClass(**record.model_dump()) + +with MongoClient(mongo_uri) as client: + #get all material_ids and dois from doi collection + doi_collection = client["mp_core"]["dois"] + materials_to_update = list(doi_collection.find({}, {"_id": 0, "material_id": 1, "doi": 1}, limit=10)) + material_ids = [entry["material_id"] for entry in materials_to_update] + + # check # of material_ids from DOI collection vs amount in robocrys + + # get description for material_ids from robocrys collection + coll = client["mp_core_blue"]["robocrys"] + res = list(coll.find({"material_id": {"$in": material_ids}}, {"_id": 0, "material_id": 1, "description": 1})) + + # join on material_id + for doc in res: + mat = next(filter(lambda x: x["material_id"] == doc["material_id"], materials_to_update)) + doc["doi"] = mat["doi"] + + +# {"material_id": ..., "doi": ..., "description": ...} -> +# Record( +# template_fields ..., +# doi: ..., +# description: ..., +# fields_where_material_id_makes_sense: ..., +# ) + From 06a3a7d3df1e24687675346b50bbd41c8a71ccf1 Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Thu, 26 Jun 2025 18:12:34 -0700 Subject: [PATCH 05/26] (after rebase) queried all desired data entries (stored as batched json files) on ELink found bug with rows greater than 100 on ElinkAPI query_records (144845 dois under 10.17188, 12 are not titled Materials Data On... (edge cases), 144833 Materials have DOIs) --- .gitignore | 1 + tests/manage_backfills.py | 49 ++++++++++++++++ tests/outputs.txt | 46 +++++++++++++++ tests/prod_to_review.py | 120 ++++++++++++++++++++++++++++++++++++++ tests/test_elink_api.py | 96 ++++++++++++++++++++++-------- 5 files changed, 289 insertions(+), 23 deletions(-) create mode 100644 tests/manage_backfills.py create mode 100644 tests/outputs.txt create mode 100644 tests/prod_to_review.py diff --git a/.gitignore b/.gitignore index 8ac3d15..128afd8 100644 --- a/.gitignore +++ b/.gitignore @@ -211,6 +211,7 @@ __marimo__/ .streamlit/secrets.toml *.json .env +>>>>>>> 7bb812e (preliminary version of ELinkGetResponseModel made, using ELinkAPI RecordResponse) json_pages/ notebooks/ diff --git a/tests/manage_backfills.py b/tests/manage_backfills.py new file mode 100644 index 0000000..a835456 --- /dev/null +++ b/tests/manage_backfills.py @@ -0,0 +1,49 @@ +# This script will see how many documents in ELink, i.e. ones with a DOI, are not accounted for in the internal DOI collection. + +from elinkapi import Elink, Query, Record + +import os +from dotenv import load_dotenv + +load_dotenv() # depends on the root directory from which you run your python scripts. + +api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) + + +query1 = api.query_records(rows=1000) + +materials_with_dois : list[Record] = [] + +for page in query1: + print(f"Now on Page: {page.title}") + print(f"Material_ID: {page.site_unique_id} and DOI: http://doi.org/{page.doi}") + + if page.site_unique_id.startswith("mp-"): + materials_with_dois.append(page) + + # for record in page.data: + # if record.site_unique_id.startswith("mp-"): + # materials_with_dois.append(record) + + + +# set_q1 = [page for page in query1] +# set_q2 = [page for page in query2] + +# set_diffq1q2 = set(set_q1) - set(set_q2) +# print (f"Difference matched {len(set)} records") + +# filtered = [ +# page for page in query1 +# if page.title.lower().startswith("materials data on") +# ] + +# print (f"Filtered Query1 has {len(filtered)} records") + +# paginate through ALL results +# for page in query1: +# print(page.title) +# print(f"Material_ID: {page.site_unique_id} and DOI: http://doi.org/{page.doi}") + +# for record in page.data: +# print (f"OSTI ID: {record.osti_id} Title: {record.title}") \ No newline at end of file diff --git a/tests/outputs.txt b/tests/outputs.txt new file mode 100644 index 0000000..8d188e7 --- /dev/null +++ b/tests/outputs.txt @@ -0,0 +1,46 @@ +(mpcite-env) C:\Users\ongha\OneDrive\Documents\GitHub\MPCite>C:/Users/ongha/anaconda3/envs/mpcite-env/python.exe c:/Users/ongha/OneDrive/Documents/GitHub/MPCite/tests/prod_to_review.py + +Query retrieved 144845 record(s) +Page finished. Now at 500 data entries. 0 edge cases found. +Page finished. Now at 1000 data entries. 0 edge cases found. +Page finished. Now at 1500 data entries. 0 edge cases found. +Page finished. Now at 2000 data entries. 0 edge cases found. +Page finished. Now at 2500 data entries. 0 edge cases found. +Page finished. Now at 3000 data entries. 0 edge cases found. +Page finished. Now at 3500 data entries. 0 edge cases found. +Page finished. Now at 4000 data entries. 0 edge cases found. +Page finished. Now at 4500 data entries. 0 edge cases found. +Page finished. Now at 5000 data entries. 0 edge cases found. +Page finished. Now at 5500 data entries. 0 edge cases found. +Page finished. Now at 6000 data entries. 0 edge cases found. +Page finished. Now at 6500 data entries. 0 edge cases found. +Page finished. Now at 7000 data entries. 0 edge cases found. +Page finished. Now at 7500 data entries. 0 edge cases found. +Page finished. Now at 8000 data entries. 0 edge cases found. +Page finished. Now at 8500 data entries. 0 edge cases found. +Page finished. Now at 9000 data entries. 0 edge cases found. +Page finished. Now at 9500 data entries. 0 edge cases found. +Page finished. Now at 10000 data entries. 0 edge cases found. +Page finished. Now at 10500 data entries. 0 edge cases found. +Page finished. Now at 11000 data entries. 0 edge cases found. +Page finished. Now at 11500 data entries. 0 edge cases found. +Page finished. Now at 12000 data entries. 0 edge cases found. +Page finished. Now at 12500 data entries. 0 edge cases found. +Page finished. Now at 13000 data entries. 0 edge cases found. +Page finished. Now at 13500 data entries. 0 edge cases found. +Page finished. Now at 14000 data entries. 0 edge cases found. +Page finished. Now at 14500 data entries. 0 edge cases found. + +Traceback (most recent call last): + File "C:\Users\ongha\anaconda3\envs\mpcite-env\Lib\site-packages\elinkapi\query.py", line 95, in __next__ + record = self.data.pop() +IndexError: pop from empty list + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "c:\Users\ongha\OneDrive\Documents\GitHub\MPCite\tests\prod_to_review.py", line 29, in + record = next(query) + File "C:\Users\ongha\anaconda3\envs\mpcite-env\Lib\site-packages\elinkapi\query.py", line 108, in __next__ + raise StopIteration +StopIteration \ No newline at end of file diff --git a/tests/prod_to_review.py b/tests/prod_to_review.py new file mode 100644 index 0000000..87e311d --- /dev/null +++ b/tests/prod_to_review.py @@ -0,0 +1,120 @@ +from elinkapi import Elink, Query, Record + +import os +from dotenv import load_dotenv + +import json + +load_dotenv() # depends on the root directory from which you run your python scripts. + +review_endpoint = "https://review.osti.gov/elink2api/" + +prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) +review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) + +print(prod_api.query_records()) + +rows_per_page = 100 + +# query production +query = prod_api.query_records(rows=rows_per_page) +print(f"Query retrieved {query.total_rows} record(s)") + +count_materials_data = 0 +count_MaterialsDataOn = 0 +cwd = os.getcwd() +page_number = 0 +page_json_list = [] + +for record in query: + # increment counter + count_materials_data = count_materials_data + 1 + print(f"On record #{count_materials_data}, next url is {query.next_url}, previous url is {query.previous_url}") + + # see if the record is a Materials Data on record + if record.title.startswith("Materials Data on"): + # increment the MaterialsDataOn counter + count_MaterialsDataOn = count_MaterialsDataOn + 1 + + # prepare the new record for the review environment, remove the OSTI ID, and add its model_dump to the list of json objects for the page. + new_record = record + new_record_dict = new_record.model_dump(exclude_none=True) + + new_record_osti_id = new_record_dict.pop("osti_id") # now new_record_dict does not have the osti_id key. + js = json.dumps(new_record_dict, default=str) # datetime objects are not JSON serializable, so we use default=str to convert them to strings. + + page_json_list.append(js) + + # TODO: take the new_record_dict and make it into a new post to the review environment and save the RecordResponse. + + else: + print(f"Found edge case: {record.title}") + + if count_materials_data % rows_per_page == 0: + # create/open, write, and close new json file + page_number = count_materials_data / rows_per_page + path = f'/json_pages/page_number_{page_number}' + fp = open(cwd+path, 'a') + + for js in page_json_list: + fp.write(js) + fp.write("\n") + + fp.close() + page_json_list = [] + + print(f"Page {page_number} finished. Now at {count_materials_data} data entries. {count_materials_data - count_MaterialsDataOn} edge cases found.") + +# print remainder of records if not a full page after for loop exits +page_number = page_number + 1 +path = f'/json_pages/page_number_{page_number}' +fp = open(cwd+path, 'a') +for js in page_json_list: + fp.write(js) + fp.write("\n") +fp.close() + +# # if contains materials data on, then add to batch +# for count_materials_data < query.total_rows: + +# # print(f"The length of the query is now {len(query.data)}") +# record = next(query) +# count_materials_data = count_materials_data + 1 + +# if record.title.startswith("Materials Data on"): +# count_MaterialsDataOn = count_MaterialsDataOn + 1 + +# new_record = record +# new_record_dict = new_record.model_dump(exclude_none=True) + +# new_record_osti_id = new_record_dict.pop("osti_id") + +# page_dict[f"Entry OSTI_ID {new_record_osti_id}"] = new_record_dict + +# # TODO: take the new_record_dict and make it into a new post to the review environment and save the RecordResponse. + + + +# if count_materials_data % rows_per_page == 0: +# # if a page has been fully consummed, then print the new batched dictionary to a json file. + +# js = json.dumps(page_dict, default=str) + +# # open new json file if not exist it will create +# cwd = os.getcwd() +# path = f'/json_pages/page_number_{count_materials_data/rows_per_page}' +# fp = open(cwd+path, 'a') + +# # write to json file +# fp.write(js) + +# # close the connection to the file and empty the dict +# fp.close() +# page_dict = {} + +# print(f"Page {(count_materials_data / rows_per_page)} finished. Now at {count_materials_data} data entries. {count_materials_data - count_MaterialsDataOn} edge cases found.") + +# model_dump exclude_none=True, remove null keys +# pop osti_id --> save batch to json files +# make new record +# post to review_api diff --git a/tests/test_elink_api.py b/tests/test_elink_api.py index d1ade7d..80afba7 100644 --- a/tests/test_elink_api.py +++ b/tests/test_elink_api.py @@ -6,7 +6,7 @@ from mpcite.models import ELinkGetResponseModel, TestClass from pymongo import MongoClient - +import pymongo load_dotenv() @@ -15,38 +15,41 @@ atlas_host = os.environ.get("atlas_host") mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" -api = Elink(token=os.environ.get("elink_api_key")) # target default is production E-link service. +api = Elink(token=os.environ.get("elink_api_PRODUCTION_key")) # target default is production E-link service. -record = api.get_single_record(1190959) -type(record) +### Grabbing an existing record -ELinkGotRecordModel = ELinkGetResponseModel.from_elinkapi_record(record) +# record = api.get_single_record(mp-id) # test for silicon -print(ELinkGotRecordModel.get_title()) -print(ELinkGotRecordModel.get_site_url()) -print(ELinkGotRecordModel.get_keywords()) -print(ELinkGotRecordModel.get_default_description()) +# type(record) +# ELinkGotRecordModel = ELinkGetResponseModel.from_elinkapi_record(record) +# print(ELinkGotRecordModel.get_title()) +# print(ELinkGotRecordModel.get_site_url()) +# print(ELinkGotRecordModel.get_keywords()) +# print(ELinkGotRecordModel.get_default_description()) -ELinkTestGetRecordModel = TestClass(**record.model_dump()) +# ELinkTestGetRecordModel = TestClass(**record.model_dump()) -with MongoClient(mongo_uri) as client: - #get all material_ids and dois from doi collection - doi_collection = client["mp_core"]["dois"] - materials_to_update = list(doi_collection.find({}, {"_id": 0, "material_id": 1, "doi": 1}, limit=10)) - material_ids = [entry["material_id"] for entry in materials_to_update] +### Making a new record + +# with MongoClient(mongo_uri) as client: +# #get all material_ids and dois from doi collection +# doi_collection = client["mp_core"]["dois"] +# materials_to_update = list(doi_collection.find({}, {"_id": 0, "material_id": 1, "doi": 1}, limit=10)) +# material_ids = [entry["material_id"] for entry in materials_to_update] - # check # of material_ids from DOI collection vs amount in robocrys +# # check # of material_ids from DOI collection vs amount in robocrys - # get description for material_ids from robocrys collection - coll = client["mp_core_blue"]["robocrys"] - res = list(coll.find({"material_id": {"$in": material_ids}}, {"_id": 0, "material_id": 1, "description": 1})) +# # get description for material_ids from robocrys collection +# coll = client["mp_core_blue"]["robocrys"] +# res = list(coll.find({"material_id": {"$in": material_ids}}, {"_id": 0, "material_id": 1, "description": 1})) - # join on material_id - for doc in res: - mat = next(filter(lambda x: x["material_id"] == doc["material_id"], materials_to_update)) - doc["doi"] = mat["doi"] +# # join on material_id +# for doc in res: +# mat = next(filter(lambda x: x["material_id"] == doc["material_id"], materials_to_update)) +# doc["doi"] = mat["doi"] # {"material_id": ..., "doi": ..., "description": ...} -> @@ -57,3 +60,50 @@ # fields_where_material_id_makes_sense: ..., # ) +# with the client open +with MongoClient(mongo_uri) as client: + # get all dois from the collection + doi_collection = client["mp_core"]["dois"] + materials_to_update = list(doi_collection.find({}, {"_id": 0, "doi": 1, "material_id": 1}, limit=2)) + + # from the doi collection, grab the material_id and doi of each material + material_ids = [entry["material_id"] for entry in materials_to_update] + + # additionally, gain the osti id from the doi + osti_ids = [entry["doi"].split("10.17188/")[1] for entry in materials_to_update] + + # additionally, grab the description of each material from the robocrys + coll = client["mp_core_blue"]["robocrys"] # grabs robocrys collection from active database + res = list(coll.find({"material_id": {"$in": material_ids}}, {"_id": 0, "material_id": 1, "description": 1})) # grabs the material id and description of entries in the collection + descriptions = [entry["description"] for entry in res] + + # for each material (and its material_id, doi, and osti_id) + for i in range(len(materials_to_update)): + internal_material_id = material_ids[i] + internal_osti_id = osti_ids[i] + internal_description = descriptions[i] + + # get_single_record(osti_id) + record = api.get_single_record(internal_osti_id) + + print(f"\n \n \nPrinting what is currently on ELINK for {internal_material_id}*****************************************") + print(record) + + if internal_material_id == record.site_unique_id: + # update description + record.description = "testTESTtestTESTtest" + + print(f"\n \n \nPrinting record for {internal_material_id}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") + print(record) + + # # post updated record + # try: + # saved_record = api.post_new_record(record, "save") + # except exceptions.BadRequestException as ve: + # ... + # # ve.message = "Site Code AAAA is not valid." + # # ve.errors provides more details: + # # [{"status":"400", "detail":"Site Code AAAA is not valid.", "source":{"pointer":"site_ownership_code"}}] + + + From 4d1f471694317c7bf4557787d807794bc5bf86ea Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Thu, 3 Jul 2025 10:49:34 -0700 Subject: [PATCH 06/26] (after rebase) almost done with pipeline, need to incorporate pymongo steps + new doi collection --- legacy/mpcite/pipeline.py | 84 ++++++++++++++++++ tests/file_to_jsonForUpload.py | 152 +++++++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 legacy/mpcite/pipeline.py create mode 100644 tests/file_to_jsonForUpload.py diff --git a/legacy/mpcite/pipeline.py b/legacy/mpcite/pipeline.py new file mode 100644 index 0000000..67fe15e --- /dev/null +++ b/legacy/mpcite/pipeline.py @@ -0,0 +1,84 @@ +import os +import json +from elinkapi import Elink, Record +from dotenv import load_dotenv + +import requests +from elinkapi.utils import Validation + +from pymongo import MongoClient +import pymongo + +from timeit import default_timer as timer +import logging +import datetime +from doi_builder import * + +load_dotenv() # depends on the root directory from which you run your python scripts. + +review_endpoint = "https://review.osti.gov/elink2api/" + +prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) +review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) + +atlas_user = os.environ.get("atlas_user") +atlas_password = os.environ.get("atlas_password") +atlas_host = os.environ.get("atlas_host") +mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" + +failed_osti_ids = [] + +query = prod_api.query_records() + +# for every record in the OSTI production environment: +for record in query: + # flag for update performance + update_success = False + + material_id = record.site_unique_id + + with MongoClient(mongo_uri) as client: + coll = client["mp_core_blue"]["robocrys"] + res = coll.find_one({"material_id" : material_id}) + + if res != None: + robocrys_description = res["description"] + + # what if there is no document in robocrys found? + + # if the description of the record on Elink doesnt match what is in the robocrys collection: + if record.description != robocrys_description: + # directly update the description of the record via the record response + record.description = robocrys_description + + # and directly update the identifier for sponsoring org + for entry in record.organizations: + if entry["type"] == "SPONSOR": + entry["identifiers"] = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + + try: + # send update to the record with the record response # update_record(osti_id, record, state="save") + record_response = prod_api.update_record(record.osti_id, record, state="save") + update_success = True + + except: + logging.debug("The update failed to save!") + # add the osti_id of the failed update to failed_osti_ids + failed_osti_ids.append(record.osti_id) + + # if the update worked... + if update_success == True: + # save the record response returned with sending the update, done above + # convert that record response into a doi_model + doi_model = RecordResponse_to_doi_model(record_response) + + # upload that doi_model as a document to the new doi collection in mp_core + upload_doi_document_model_to_collection(doi_model, MongoClient, collection_name) + +# else if the description on Elink matches what is in the robocrys collection: +# convert that record into a doi_model +# upload that doi_model as a document to the new doi collection in mp_core, no updated needed! + +with open(f"failed_osti_ids_{str(datetime.datetime.now())}.txt", 'w') as output: # change filepath as needed + for id in failed_osti_ids: + output.write(str(id) + '\n') # i'm pretty sure it's a string already though... \ No newline at end of file diff --git a/tests/file_to_jsonForUpload.py b/tests/file_to_jsonForUpload.py new file mode 100644 index 0000000..aaa0886 --- /dev/null +++ b/tests/file_to_jsonForUpload.py @@ -0,0 +1,152 @@ +import os +import json +from elinkapi import Elink, Record +from dotenv import load_dotenv + +import requests +from elinkapi.utils import Validation + +from pymongo import MongoClient +import pymongo + +from timeit import default_timer as timer + +load_dotenv() # depends on the root directory from which you run your python scripts. + +review_endpoint = "https://review.osti.gov/elink2api/" + +prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) +review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) + + +print(review_api.get_single_record(2525340)) +raise + +atlas_user = os.environ.get("atlas_user") +atlas_password = os.environ.get("atlas_password") +atlas_host = os.environ.get("atlas_host") +mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" + +cwd = os.getcwd() +path = "/json_pages/page_number_4.0" # IT'S ONLY DOING ONE FILE RIGHT NOW LOL +file = open(cwd + path, "r") + +update_counter = 0 +records_checked = 0 + +def delete_record(api, osti_id, reason): + """Delete a record by its OSTI ID.""" + response = requests.delete(f"{api.target}records/{osti_id}?reason={reason}", headers={"Authorization": f"Bearer {api.token}"}) + Validation.handle_response(response) + return response.status_code == 204 # True if deleted successfully + +def emptyReviewAPI(reason): + allDeleted = True + for record in review_api.query_records(): + delete_record(review_api, record.osti_id, reason) + +start = timer() + +# Post an updated json + +postUnedited = False + +for line in file: + js = json.loads(line.strip()) + + for entry in js["organizations"]: + if entry["type"] == "SPONSOR": + entry["identifiers"] = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + + material_id = js["site_unique_id"] + + robocrys_description = js["description"] + + with MongoClient(mongo_uri) as client: + coll = client["mp_core_blue"]["robocrys"] + res = coll.find_one({"material_id" : material_id}) + records_checked += 1 + + if res != None: + robocrys_description = res["description"] + + # see if an update to the description is necessary, if it is, then update the description and post a new record. + if postUnedited or (robocrys_description != None and js["description"] != robocrys_description): #if a robocrys_description was found internally and it doesn't match what ELink has record... + js["description"] = "OLD WAS UPDATED, THEN IT WAS POSTED: " + robocrys_description + my_record = Record(**js) + + saved_record = None + try: + # The API will now return an error code on this call + # because "AAAA" is not a valid site_ownership_code + + saved_record = review_api.post_new_record(my_record, state="submit") + update_counter += 1 + + print(f"NEW RECORD POSTED: {saved_record.osti_id}") + raise + except: + print(f"Record failed to post!: {my_record.doi}. Robocrys Collection Had Description {robocrys_description[0:50]}... Prod_Env ELink Had {my_record.description[37:87]}...") + raise + + if update_counter >= 10000: + break + +end = timer() +print(f"Records Updated and/or Posted: {update_counter} \nRecords Checked in Total: {records_checked}. \nIt took {end - start} seconds") + +####################################################### +# JUST POST JSON, Then update posted json Later +# post_counter = 0 +# records_checked = 0 + +# for line in file: +# js = json.loads(line.strip()) + +# material_id = js["site_unique_id"] + +# # always post, no update +# my_record = Record(**js) + +# saved_record = None +# try: +# # The API will now return an error code on this call +# # because "AAAA" is not a valid site_ownership_code + +# # posts an unupdated record +# saved_record = review_api.post_new_record(my_record, "save") +# post_counter += 1 + +# print("\n\n NEW RECORD POSTED") +# print(saved_record) + +# robocrys_description = js["description"] + +# with MongoClient(mongo_uri) as client: +# coll = client["mp_core_blue"]["robocrys"] +# res = coll.find_one({"material_id" : material_id}) +# records_checked += 1 + +# if res != None: +# robocrys_description = res["description"] + +# if robocrys_description != None and js["description"] != robocrys_description: # if an update is needed +# # update the js["description"] +# js["description"] = "OLD WAS POSTED, THEN RECORD WITH NEW DESCRIPTION UPDATED IT: " + robocrys_description + +# # turn it into a new record +# new_updated_record = Record(**js) + +# # use that new record to update what was just posted +# review_api.update_record(saved_record.osti_id, new_updated_record, "save") + +# except: +# print("Record failed to post!") + +# if post_counter >= 10000: +# break + +# end = timer() +# print(f"Records Updated and/or Posted: {update_counter} \n Records Checked in Total: {records_checked}. It took {end - start} seconds") + +###################################################### \ No newline at end of file From a8172c93dd71ced6ba4a55631d5a99ce62c99fdb Mon Sep 17 00:00:00 2001 From: Tyler Mathis <35553152+tsmathis@users.noreply.github.com> Date: Thu, 3 Jul 2025 12:15:20 -0700 Subject: [PATCH 07/26] (after rebase) template for core --- legacy/mpcite/core.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 legacy/mpcite/core.py diff --git a/legacy/mpcite/core.py b/legacy/mpcite/core.py new file mode 100644 index 0000000..1159b66 --- /dev/null +++ b/legacy/mpcite/core.py @@ -0,0 +1,36 @@ +from typing import TypeAlias + +from elinkapi import Elink +from elinkapi.record import RecordResponse +from pymongo import MongoClient + +OstiID: TypeAlias = int + + +def find_out_of_date_records( + client: MongoClient, + robocrys_db: str, + robocrys_collection: str, + doi_db: str, + doi_collection, +) -> list[OstiID]: + robocrys = client.robocrys_db.robocrys_collection + doi = client.doi_db.doi_collection + + out_of_data_osti_ids = [] + + # robocrys docs newer than in doi + + return out_of_data_osti_ids + + +def update_existing_osti_record(*args, **kwargs) -> RecordResponse: ... + + +def submit_new_osti_record(*args, **kwargs) -> RecordResponse: ... + + +def update_state_of_osti_record(*args, **kwargs) -> RecordResponse: ... + + +def delete_osti_record(*args, **kwargs) -> RecordResponse: ... From 128627993dc1558371dd1df2af724e6c8e9bdf70 Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Thu, 3 Jul 2025 13:56:05 -0700 Subject: [PATCH 08/26] (after rebase) push latest doi_builder and other thigns --- legacy/mpcite/pipeline.py | 117 ++++++++++++++---------- mpcite/doi_builder.py | 186 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 255 insertions(+), 48 deletions(-) create mode 100644 mpcite/doi_builder.py diff --git a/legacy/mpcite/pipeline.py b/legacy/mpcite/pipeline.py index 67fe15e..869e65f 100644 --- a/legacy/mpcite/pipeline.py +++ b/legacy/mpcite/pipeline.py @@ -1,6 +1,7 @@ import os import json from elinkapi import Elink, Record +from elinkapi.record import RecordResponse from dotenv import load_dotenv import requests @@ -28,57 +29,77 @@ failed_osti_ids = [] -query = prod_api.query_records() +cwd = os.getcwd() +path = "/json_pages/" -# for every record in the OSTI production environment: -for record in query: - # flag for update performance - update_success = False +for filename in os.listdir(cwd+path): + logging.debug(f"Now extracting {filename}") + file = open(cwd + path + filename, "r") + for line in file: + record = RecordResponse(**json.loads(line.strip())) + record.osti_id = record.doi.split('/')[1] + # for every record in the OSTI production environment: + # flag for update performance + update_success = False - material_id = record.site_unique_id + material_id = record.site_unique_id - with MongoClient(mongo_uri) as client: - coll = client["mp_core_blue"]["robocrys"] - res = coll.find_one({"material_id" : material_id}) - - if res != None: - robocrys_description = res["description"] - - # what if there is no document in robocrys found? - - # if the description of the record on Elink doesnt match what is in the robocrys collection: - if record.description != robocrys_description: - # directly update the description of the record via the record response - record.description = robocrys_description + with MongoClient(mongo_uri) as client: # should I open this in or outside of the for loop? + coll = client["mp_core_blue"]["robocrys"] + res = coll.find_one({"material_id" : material_id}) - # and directly update the identifier for sponsoring org - for entry in record.organizations: - if entry["type"] == "SPONSOR": - entry["identifiers"] = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - - try: - # send update to the record with the record response # update_record(osti_id, record, state="save") - record_response = prod_api.update_record(record.osti_id, record, state="save") - update_success = True - - except: - logging.debug("The update failed to save!") - # add the osti_id of the failed update to failed_osti_ids - failed_osti_ids.append(record.osti_id) - - # if the update worked... - if update_success == True: - # save the record response returned with sending the update, done above - # convert that record response into a doi_model - doi_model = RecordResponse_to_doi_model(record_response) - - # upload that doi_model as a document to the new doi collection in mp_core - upload_doi_document_model_to_collection(doi_model, MongoClient, collection_name) - -# else if the description on Elink matches what is in the robocrys collection: -# convert that record into a doi_model -# upload that doi_model as a document to the new doi collection in mp_core, no updated needed! - -with open(f"failed_osti_ids_{str(datetime.datetime.now())}.txt", 'w') as output: # change filepath as needed + if res != None: + robocrys_description = res["description"] + + # what if there is no document in robocrys found? + else: + logging.warning(f"No robocrys document was found to match the OSTI record: {record.osti_id}!") + + # if the description of the record on Elink doesnt match what is in the robocrys collection: + if res != None and record.description != robocrys_description: + # directly update the description of the record via the record response + record.description = robocrys_description + + # and directly update the identifier for sponsoring org + for entry in record.organizations: + if entry.type == "SPONSOR": + entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + break + + try: + # send update to the record with the record response # update_record(osti_id, record, state="save") + # record_response = prod_api.update_record(record.osti_id, record, state="save") + update_success = True + + except: + logging.debug("The update failed to save!") + # add the osti_id of the failed update to failed_osti_ids + failed_osti_ids.append(record.osti_id) + + # if the update worked... + if update_success == True: + # save the record response returned with sending the update, done above + # convert that record response into a doi_model + doi_model = RecordResponse_to_doi_model(record) #change later to record response + + # upload that doi_model as a document to the new doi collection in mp_core + # what is the collection + with MongoClient() as local_client: + collection = local_client["dois_test"]["dois"] + x = collection.insert_one(doi_model.dict(by_alias=True)).inserted_id + + # else if the description on Elink matches what is in the robocrys collection: + elif record.description == robocrys_description: + # convert that record into a doi_model + doi_model = RecordResponse_to_doi_model(record) + + # upload that doi_model as a document to the new doi collection in mp_core, no updated needed! + with MongoClient() as local_client: + collection = local_client["dois_test"]["dois"] + x = collection.insert_one(doi_model).inserted_id + +cwd = os.getcwd() +path = f"/files/failed_osti_ids_{str(datetime.datetime.now())}.txt" +with open(cwd+path, 'w') as output: # change filepath as needed for id in failed_osti_ids: output.write(str(id) + '\n') # i'm pretty sure it's a string already though... \ No newline at end of file diff --git a/mpcite/doi_builder.py b/mpcite/doi_builder.py new file mode 100644 index 0000000..8ddd4c4 --- /dev/null +++ b/mpcite/doi_builder.py @@ -0,0 +1,186 @@ +''' +doi_builder.py +A doi collection must store the following information about a document: +- doi number +- title +- osti id (ELink's Unique Identifier) +- material id (MP's Unique Identifier) +- date of system entry date (Date (UTC) of this revision's inception) +- date of last update (date edited or date_submitted_to_osti_last) (take from ELink) +- workflow status and the date (?) of each step: + - SA, saved, in a holding state, not to be processed + - SR, submit to releasing official "released_to_osti_date, as entered by releasing official" + - SO, submit to OSTI + - SF, submitted but failed validation + - SX, submitted but failed to release + - SV, submitted and validated + - R, released +- + +Here is an example of RecordResponse +RecordResponse( + osti_id=2523296, + workflow_status='SA', + access_limitations=['UNL'], + access_limitation_other=None, + announcement_codes=None, + availability=None, + edition=None, + volume=None, + + # Identifiers + identifiers=[ + Identifier(type='CN_NONDOE', value='EDCBEE'), + Identifier(type='CN_DOE', value='AC02-05CH11231'), + Identifier(type='RN', value='mp-1037659'), + ], + + # People involved + persons=[ + Person( + type='CONTACT', + first_name='Kristin', + last_name='Persson', + phone='+1(510)486-7218', + email=['feedback@materialsproject.org'], + affiliations=[ + Affiliation(name='LBNL') + ] + ) + ], + + # Organizations + organizations=[ + Organization(name='The Materials Project', type='CONTRIBUTING', contributor_type='ResearchGroup'), + Organization(name='LBNL Materials Project', type='RESEARCHING'), + Organization(name='Lawrence Berkeley National Laboratory (LBNL), Berkeley, CA (United States)', type='RESEARCHING'), + Organization(name='USDOE Office of Science (SC), Basic Energy Sciences (BES) (SC-22)', type='SPONSOR'), + Organization(name='MIT', type='CONTRIBUTING', contributor_type='Other'), + Organization(name='UC Berkeley', type='CONTRIBUTING', contributor_type='Other'), + Organization(name='Duke', type='CONTRIBUTING', contributor_type='Other'), + Organization(name='U Louvain', type='CONTRIBUTING', contributor_type='Other'), + ], + + # Metadata + country_publication_code='US', + doe_supported_flag=False, + doi='10.17188/1714845', + edit_reason='Record updated upon request of LBNL-MP to remove authors and replace with a single collaborator.', + format_information='', + invention_disclosure_flag=None, + paper_flag=False, + peer_reviewed_flag=False, + product_type='DA', + publication_date=datetime.date(2020, 4, 30), + publication_date_text='04/30/2020', + site_url='https://materialsproject.org/materials/mp-1037659', + site_ownership_code='LBNL-MP', + site_unique_id='mp-1037659', + subject_category_code=['36'], + title='Materials Data on RbYMg30O32 by Materials Project', + + # Description + description=""" + RbMg₃₀YO₃₂ is Molybdenum Carbide MAX Phase-derived and crystallizes in the tetragonal P4/mmm space group. + Rb¹⁺ is bonded to six O²⁻ atoms to form RbO₆ octahedra... + (Truncated here for brevity, full description is included in original) + """, + + keywords=['crystal structure', 'RbYMg30O32', 'Mg-O-Rb-Y'], + languages=['English'], + related_doc_info='https://materialsproject.org/citing', + + # Media + media=[ + MediaInfo( + media_id=1908478, + osti_id=2523296, + status='C', + mime_type='text/html', + files=[ + MediaFile( + media_file_id=12017281, + media_type='O', + url='https://materialsproject.org/materials/mp-1037659' + ), + MediaFile( + media_file_id=12017284, + media_type='C', + mime_type='text/html', + media_source='OFF_SITE_DOWNLOAD' + ) + ] + ) + ], + + # Audit logs + audit_logs=[ + AuditLog( + messages=['Revision status is not correct, found SA'], + status='FAIL', + type='RELEASER', + audit_date=datetime.datetime(2025, 6, 30, 22, 30, 24, 865000, tzinfo=TzInfo(UTC)) + ) + ], + + # Timestamps + date_metadata_added=datetime.datetime(2025, 6, 30, 22, 30, 20, 495000, tzinfo=TzInfo(UTC)), + date_metadata_updated=datetime.datetime(2025, 6, 30, 22, 30, 22, 247000, tzinfo=TzInfo(UTC)), + + # Misc + revision=2, + added_by=139001, + edited_by=139001, + collection_type='DOE_LAB', + hidden_flag=False +) +''' + +from pydantic import BaseModel, ConfigDict +import datetime + +class doi_model(BaseModel): + # identifiers + doi: str # can be taken from ELink API + title: str # can be taken from ELink API + osti_id: str # can be taken from ELink API + material_id: str # can be taken from Robocrys Collection or ELink API + + # time stamps + date_record_entered_onto_ELink: datetime.datetime # can be taken from ELink API response + date_record_last_updated_on_Elink: datetime.datetime + + # status + elink_workflow_status: str # can be taken from ELink API + date_released: datetime.datetime + date_submitted_to_osti_first: datetime.datetime + date_submitted_to_osti_last: datetime.datetime + date_published: datetime.datetime # labelled as publication_date in RecordResponse of ELink API + +# hypothetically post an update or submit a new record and receive the RecordResponse +def RecordResponse_to_doi_model(recordresponse): + ''' + turns a recordresponse, which is returned from a save, submission, post, etc. into a doi_model object + ''' + params = { + "doi": recordresponse.doi, + "title": recordresponse.title, + "osti_id": str(recordresponse.osti_id), + "material_id": recordresponse.site_unique_id, + + "date_record_entered_onto_ELink": recordresponse.date_metadata_added, + "date_record_last_updated_on_Elink": recordresponse.date_metadata_updated, + + "elink_workflow_status": recordresponse.workflow_status, + "date_released": recordresponse.date_released, + # date_released_to_osti = recordresponse.released_to_osti_date, # what is the difference between these??? "Date record information was released to OSTI, as entered by releasing official." always seems to be none + "date_submitted_to_osti_first": recordresponse.date_submitted_to_osti_first, # date record was first submitted to OSTI for publication, maintained internally by E-Link + "date_submitted_to_osti_last": recordresponse.date_submitted_to_osti_last, # most recent date record information was submitted to OSTI. Maintained internally by E-Link. + "date_published": recordresponse.publication_date + } + + return doi_model(**params) + +def upload_doi_document_model_to_collection(doi_model, client, collection): + x = collection.insert_one(doi_model).inserted_id + return x \ No newline at end of file From 5466b4fc896b505b2080183dab808f5ef772020e Mon Sep 17 00:00:00 2001 From: Tyler Mathis <35553152+tsmathis@users.noreply.github.com> Date: Thu, 3 Jul 2025 14:05:10 -0700 Subject: [PATCH 09/26] find_out_of_date func --- legacy/mpcite/core.py | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/legacy/mpcite/core.py b/legacy/mpcite/core.py index 1159b66..88f1399 100644 --- a/legacy/mpcite/core.py +++ b/legacy/mpcite/core.py @@ -7,21 +7,44 @@ OstiID: TypeAlias = int -def find_out_of_date_records( +def find_out_of_date_doi_entries( client: MongoClient, robocrys_db: str, robocrys_collection: str, doi_db: str, - doi_collection, + doi_collection: str, ) -> list[OstiID]: - robocrys = client.robocrys_db.robocrys_collection - doi = client.doi_db.doi_collection - - out_of_data_osti_ids = [] - - # robocrys docs newer than in doi - - return out_of_data_osti_ids + robocrys = client[robocrys_db][robocrys_collection] + dois = client[doi_db][doi_collection] + + latest_doi = next( + dois.aggregate( + [ + {"$project": {"_id": 0, "date_record_last_updated_on_Elink": 1}}, + {"$sort": {"date_record_last_updated_on_Elink": -1}}, + {"$limit": 1}, + ] + ) + )["date_record_last_updated_on_Elink"] + + material_ids_to_update = list( + map( + lambda x: x["material_id"], + robocrys.find( + {"last_updated": {"$gt": latest_doi}}, {"_id": 0, "material_id": 1} + ), + ) + ) + + return list( + map( + lambda x: x["osti_id"], + dois.find( + {"material_id": {"$in": material_ids_to_update}}, + {"_id": 0, "osti_id": 1}, + ), + ), + ) def update_existing_osti_record(*args, **kwargs) -> RecordResponse: ... From f4325ba3e9313fa316d9377ada9ca4de544c53dc Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Tue, 8 Jul 2025 12:23:54 -0700 Subject: [PATCH 10/26] remaining core functions --- legacy/mpcite/core.py | 66 +++++++++++++++++++++++++++++++--- legacy/mpcite/pipeline.py | 2 +- mpcite/doi_builder.py | 17 ++++----- tests/file_to_jsonForUpload.py | 3 +- tests/github_bug_report.py | 31 ++++++++++++++++ 5 files changed, 103 insertions(+), 16 deletions(-) create mode 100644 tests/github_bug_report.py diff --git a/legacy/mpcite/core.py b/legacy/mpcite/core.py index 88f1399..188d80e 100644 --- a/legacy/mpcite/core.py +++ b/legacy/mpcite/core.py @@ -1,9 +1,12 @@ from typing import TypeAlias from elinkapi import Elink -from elinkapi.record import RecordResponse +from elinkapi.record import RecordResponse, Record from pymongo import MongoClient +import requests +from elinkapi.utils import Validation + OstiID: TypeAlias = int @@ -47,13 +50,66 @@ def find_out_of_date_doi_entries( ) -def update_existing_osti_record(*args, **kwargs) -> RecordResponse: ... +def update_existing_osti_record( + elinkapi: Elink, + osti_id: OstiID, + new_values: dict +) -> RecordResponse: + record_on_elink = elinkapi.get_single_record(osti_id) + + for keyword in new_values.keys(): + try: + setattr(record_on_elink, keyword, new_values[keyword]) + except ValueError: + print("Extraneous keywords found in the dictionary that do not correspond to attributes in the ELink API's record class.") + + # assume the use with fix the sponsor identifier bug before calling the update function + # # fix the issue with the sponsor organization's identifiers + # for entry in record_on_elink.organizations: + # if entry.type == "SPONSOR": + # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + # break + + return elinkapi.update_record(osti_id, record_on_elink, state="save") # user should use update_state_of_osti_record to submit instead + + +def submit_new_osti_record( + elinkapi: Elink, + new_record: Record, + state = "submit", # assuming there is no need to both with saving. just send new record to osti when its ready for submission. also assume bug with DOE contract number identifier in sponsor organization is accounted for +) -> RecordResponse: + # template for all repeated stuff + # only submit + record_response = elinkapi.post_new_record(new_record, state) + + return record_response + +def update_state_of_osti_record( + elinkapi: Elink, + osti_id: OstiID, + new_state = "submit" +) -> RecordResponse: + record = elinkapi.get_single_record(osti_id) -def submit_new_osti_record(*args, **kwargs) -> RecordResponse: ... + # assuming that the user will handle the sponsor identifier bug before calling this function + # # fix the issue with the sponsor organization's identifiers + # for entry in record.organizations: + # if entry.type == "SPONSOR": + # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + # break + return elinkapi.update_record(osti_id, record, new_state) -def update_state_of_osti_record(*args, **kwargs) -> RecordResponse: ... +def delete_osti_record( + elinkapi: Elink, + osti_id: OstiID, + reason: str +) -> RecordResponse: + """Delete a record by its OSTI ID.""" + response = requests.delete(f"{elinkapi.target}records/{osti_id}?reason={reason}", headers={"Authorization": f"Bearer {api.token}"}) + Validation.handle_response(response) + return response.status_code == 204 # True if deleted successfully -def delete_osti_record(*args, **kwargs) -> RecordResponse: ... +# TODO: make the github error thing for the weird issue with identifiers not being allocated to the sponsoring organization \ No newline at end of file diff --git a/legacy/mpcite/pipeline.py b/legacy/mpcite/pipeline.py index 869e65f..3d2c8c7 100644 --- a/legacy/mpcite/pipeline.py +++ b/legacy/mpcite/pipeline.py @@ -68,7 +68,7 @@ try: # send update to the record with the record response # update_record(osti_id, record, state="save") - # record_response = prod_api.update_record(record.osti_id, record, state="save") + record_response = prod_api.update_record(record.osti_id, record, state="save") update_success = True except: diff --git a/mpcite/doi_builder.py b/mpcite/doi_builder.py index 8ddd4c4..5e25c64 100644 --- a/mpcite/doi_builder.py +++ b/mpcite/doi_builder.py @@ -136,14 +136,15 @@ ) ''' -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field import datetime +# TODO: change the field names to match ELINK class doi_model(BaseModel): # identifiers - doi: str # can be taken from ELink API - title: str # can be taken from ELink API - osti_id: str # can be taken from ELink API + doi: str = Field(description="") # can be taken from ELink API + title: str = Field(description="") # can be taken from ELink API + osti_id: str = Field(description="") # can be taken from ELink API material_id: str # can be taken from Robocrys Collection or ELink API # time stamps @@ -152,10 +153,10 @@ class doi_model(BaseModel): # status elink_workflow_status: str # can be taken from ELink API - date_released: datetime.datetime - date_submitted_to_osti_first: datetime.datetime - date_submitted_to_osti_last: datetime.datetime - date_published: datetime.datetime # labelled as publication_date in RecordResponse of ELink API + date_released: datetime.datetime = Field(description="") + date_submitted_to_osti_first: datetime.datetime = Field(description="") + date_submitted_to_osti_last: datetime.datetime = Field(description="") + date_published: datetime.datetime = Field(description="") # labelled as publication_date in RecordResponse of ELink API # hypothetically post an update or submit a new record and receive the RecordResponse def RecordResponse_to_doi_model(recordresponse): diff --git a/tests/file_to_jsonForUpload.py b/tests/file_to_jsonForUpload.py index aaa0886..f633305 100644 --- a/tests/file_to_jsonForUpload.py +++ b/tests/file_to_jsonForUpload.py @@ -20,7 +20,6 @@ print(review_api.get_single_record(2525340)) -raise atlas_user = os.environ.get("atlas_user") atlas_password = os.environ.get("atlas_password") @@ -28,7 +27,7 @@ mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" cwd = os.getcwd() -path = "/json_pages/page_number_4.0" # IT'S ONLY DOING ONE FILE RIGHT NOW LOL +path = "/json_pages/page_number_4.0" # IT'S ONLY DOING ONE FILE RIGHT NOW file = open(cwd + path, "r") update_counter = 0 diff --git a/tests/github_bug_report.py b/tests/github_bug_report.py new file mode 100644 index 0000000..c1eec4c --- /dev/null +++ b/tests/github_bug_report.py @@ -0,0 +1,31 @@ +from elinkapi import Elink +from elinkapi.record import Record +import os +from dotenv import load_dotenv + +load_dotenv() + +prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) +review_endpoint = "https://review.osti.gov/elink2api/" +review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) + +raise + +record_response = prod_api.get_single_record(1190959) # returns OSTI record response with OSTI ID = 1190959, which has a DOE Contract Number saved (AC02-05CH11231; EDCBEE) +record_response_dict = record_response.model_dump(exclude_none=True) +record_response_dict.pop("osti_id") # remove osti_id to allow post function + +new_record = Record(**record_response_dict) # identical record with removed OSTI_ID +for org in new_record.organizations: + if org.type == "SPONSOR": + print(org) + org.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + +# attempt to submit exact same record to review environment +record_response_after_post = review_api.post_new_record(new_record, "save") # works after re-providing the DOE contract number + +# next, attempt updating this record +record_to_update = review_api.get_single_record(record_response_after_post.osti_id) +record_to_update.title = "Updated Title For Materials Data" +review_api.update_record(record_response_after_post.osti_id, record_to_update, "submit") + From ce7651d1d1d954baf83459b91e51bf1eff101752 Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Thu, 10 Jul 2025 15:18:23 -0700 Subject: [PATCH 11/26] testing core functionalities on review environment and local mongo database --- legacy/mpcite/core.py | 37 +++++++++++---- mpcite/doi_builder.py | 34 +++++++------- mpcite/reset.py | 26 +++++++++++ mpcite/test_core.py | 84 ++++++++++++++++++++++++++++++++++ tests/file_to_jsonForUpload.py | 4 +- tests/github_bug_report.py | 62 ++++++++++++++++--------- 6 files changed, 197 insertions(+), 50 deletions(-) create mode 100644 mpcite/reset.py create mode 100644 mpcite/test_core.py diff --git a/legacy/mpcite/core.py b/legacy/mpcite/core.py index 188d80e..24be6b3 100644 --- a/legacy/mpcite/core.py +++ b/legacy/mpcite/core.py @@ -1,34 +1,37 @@ from typing import TypeAlias from elinkapi import Elink -from elinkapi.record import RecordResponse, Record +from elinkapi.record import RecordResponse, Record, Organization, Person from pymongo import MongoClient import requests from elinkapi.utils import Validation +from datetime import datetime + OstiID: TypeAlias = int def find_out_of_date_doi_entries( - client: MongoClient, + rc_client: MongoClient, + doi_client: MongoClient, robocrys_db: str, robocrys_collection: str, doi_db: str, doi_collection: str, ) -> list[OstiID]: - robocrys = client[robocrys_db][robocrys_collection] - dois = client[doi_db][doi_collection] + robocrys = rc_client[robocrys_db][robocrys_collection] + dois = doi_client[doi_db][doi_collection] latest_doi = next( dois.aggregate( [ - {"$project": {"_id": 0, "date_record_last_updated_on_Elink": 1}}, - {"$sort": {"date_record_last_updated_on_Elink": -1}}, + {"$project": {"_id": 0, "date_metadata_updated": 1}}, + {"$sort": {"date_metadata_updated": -1}}, {"$limit": 1}, ] ) - )["date_record_last_updated_on_Elink"] + )["date_metadata_updated"] material_ids_to_update = list( map( @@ -108,8 +111,24 @@ def delete_osti_record( reason: str ) -> RecordResponse: """Delete a record by its OSTI ID.""" - response = requests.delete(f"{elinkapi.target}records/{osti_id}?reason={reason}", headers={"Authorization": f"Bearer {api.token}"}) + response = requests.delete(f"{elinkapi.target}records/{osti_id}?reason={reason}", headers={"Authorization": f"Bearer {elinkapi.token}"}) Validation.handle_response(response) return response.status_code == 204 # True if deleted successfully -# TODO: make the github error thing for the weird issue with identifiers not being allocated to the sponsoring organization \ No newline at end of file +def emptyReviewAPI(reason, review_api): + allDeleted = True + for record in review_api.query_records(): + delete_osti_record(review_api, record.osti_id, reason) + +def make_minimum_record_to_fully_release( + title, # required to make record + product_type = "DA", # required to make record + organizations = [Organization(type='RESEARCHING', name='LBNL Materials Project (LBNL-MP)'), + Organization(type='SPONSOR', name='TEST SPONSOR ORG', identifiers=[{"type": 'CN_DOE', "value": 'AC02-05CH11231'}])], # sponsor org is necessary for submission + persons = [Person(type='AUTHOR', last_name='Perrson')], + site_ownership_code = "LBNL-MP", + access_limitations = ['UNL'], + publication_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0), # what should this be? + site_url = "https://next-gen.materialsproject.org/materials" +) -> Record: + return Record(product_type, title, persons, site_ownership_code, access_limitations, publication_date, site_url) \ No newline at end of file diff --git a/mpcite/doi_builder.py b/mpcite/doi_builder.py index 5e25c64..713724c 100644 --- a/mpcite/doi_builder.py +++ b/mpcite/doi_builder.py @@ -137,26 +137,26 @@ ''' from pydantic import BaseModel, ConfigDict, Field -import datetime +from datetime import datetime # TODO: change the field names to match ELINK class doi_model(BaseModel): # identifiers - doi: str = Field(description="") # can be taken from ELink API - title: str = Field(description="") # can be taken from ELink API - osti_id: str = Field(description="") # can be taken from ELink API + doi: str = Field(description="The DOI number as allocated by OSTI") # can be taken from ELink API + title: str = Field(description="The title of the record") # can be taken from ELink API + osti_id: str = Field(description="The OSTI ID number allocated by OSTI to make the DOI number") # can be taken from ELink API material_id: str # can be taken from Robocrys Collection or ELink API # time stamps - date_record_entered_onto_ELink: datetime.datetime # can be taken from ELink API response - date_record_last_updated_on_Elink: datetime.datetime + date_metadata_added: datetime | None = Field(description="date_record_entered_onto_ELink") # can be taken from ELink API response + date_metadata_updated: datetime | None = Field(description="date_record_last_updated_on_Elink") # status - elink_workflow_status: str # can be taken from ELink API - date_released: datetime.datetime = Field(description="") - date_submitted_to_osti_first: datetime.datetime = Field(description="") - date_submitted_to_osti_last: datetime.datetime = Field(description="") - date_published: datetime.datetime = Field(description="") # labelled as publication_date in RecordResponse of ELink API + workflow_status: str # can be taken from ELink API + date_released: datetime | None = Field(description="") + date_submitted_to_osti_first: datetime = Field(description="date record was first submitted to OSTI for publication, maintained internally by E-Link") + date_submitted_to_osti_last: datetime = Field(description="most recent date record information was submitted to OSTI. Maintained internally by E-Link") + publication_date: datetime | None = Field(description="") # labelled as publication_date in RecordResponse of ELink API # hypothetically post an update or submit a new record and receive the RecordResponse def RecordResponse_to_doi_model(recordresponse): @@ -169,19 +169,19 @@ def RecordResponse_to_doi_model(recordresponse): "osti_id": str(recordresponse.osti_id), "material_id": recordresponse.site_unique_id, - "date_record_entered_onto_ELink": recordresponse.date_metadata_added, - "date_record_last_updated_on_Elink": recordresponse.date_metadata_updated, + "date_metadata_added": recordresponse.date_metadata_added, + "date_metadata_updated": recordresponse.date_metadata_updated, - "elink_workflow_status": recordresponse.workflow_status, + "workflow_status": recordresponse.workflow_status, "date_released": recordresponse.date_released, # date_released_to_osti = recordresponse.released_to_osti_date, # what is the difference between these??? "Date record information was released to OSTI, as entered by releasing official." always seems to be none "date_submitted_to_osti_first": recordresponse.date_submitted_to_osti_first, # date record was first submitted to OSTI for publication, maintained internally by E-Link "date_submitted_to_osti_last": recordresponse.date_submitted_to_osti_last, # most recent date record information was submitted to OSTI. Maintained internally by E-Link. - "date_published": recordresponse.publication_date + "publication_date": recordresponse.publication_date } return doi_model(**params) -def upload_doi_document_model_to_collection(doi_model, client, collection): - x = collection.insert_one(doi_model).inserted_id +def upload_doi_document_model_to_collection(doi_model, collection): + x = collection.insert_one(doi_model.model_dump()).inserted_id return x \ No newline at end of file diff --git a/mpcite/reset.py b/mpcite/reset.py new file mode 100644 index 0000000..350b50c --- /dev/null +++ b/mpcite/reset.py @@ -0,0 +1,26 @@ +from mpcite.core import * +from mpcite.doi_builder import RecordResponse_to_doi_model, upload_doi_document_model_to_collection +import os +import json +from dotenv import load_dotenv + +load_dotenv() # depends on the root directory from which you run your python scripts. + +review_endpoint = "https://review.osti.gov/elink2api/" + +prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) +review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) + +cwd = os.getcwd() +path = "/json_pages/page_number_1000.0" # IT'S ONLY DOING ONE FILE RIGHT NOW +file = open(cwd + path, "r") + +atlas_user = os.environ.get("atlas_user") +atlas_password = os.environ.get("atlas_password") +atlas_host = os.environ.get("atlas_host") +mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" + +# emptyReviewAPI("Testing", review_api) + +with MongoClient() as client: + client.dois_test.dois.delete_many({}, comment="Testing") \ No newline at end of file diff --git a/mpcite/test_core.py b/mpcite/test_core.py new file mode 100644 index 0000000..948d83c --- /dev/null +++ b/mpcite/test_core.py @@ -0,0 +1,84 @@ +from mpcite.core import * +from mpcite.doi_builder import RecordResponse_to_doi_model, upload_doi_document_model_to_collection +import os +import json +from dotenv import load_dotenv + +load_dotenv() # depends on the root directory from which you run your python scripts. + +review_endpoint = "https://review.osti.gov/elink2api/" + +prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) +review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) + +cwd = os.getcwd() +path = "/json_pages/page_number_1000.0" # IT'S ONLY DOING ONE FILE RIGHT NOW +file = open(cwd + path, "r") + +atlas_user = os.environ.get("atlas_user") +atlas_password = os.environ.get("atlas_password") +atlas_host = os.environ.get("atlas_host") +mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" + +with MongoClient(mongo_uri) as real_client: + with MongoClient() as doi_client: # open the mongoclient outside of the for loop, is more efficient than opening and closing it repeatedly + dois = doi_client["dois_test"]["dois"] + + # for line in file: + # js = json.loads(line.strip()) + + # # temporarily fix the sponsor organization bug + # for entry in js["organizations"]: + # if entry["type"] == "SPONSOR": + # entry["identifiers"] = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + + # my_record = Record(**js) + + # # make a post to the elink review environment + # saved_record = review_api.post_new_record(my_record, state="submit") + + # # make a doi document with saved_record + # doi_model = RecordResponse_to_doi_model(saved_record) + + # # now, add that doi to the local doi collection + # upload_doi_document_model_to_collection(doi_model, dois) + + # all_material_ids = [doc["material_id"] for doc in dois.find({}, {"_id": 0, "material_id": 1})] + + # for material_id in all_material_ids: + + # # query prod env for record with materials_id == site_unique_id + # record_from_prod = prod_api.query_records(site_unique_id=material_id) + + # if record_from_prod.total_rows != 1: + # print(f"ERROR: not unique Material_ID! {material_id}") + # raise + + # # make a doi_model from that data + # recordresponse_from_prod = RecordResponse_to_doi_model(record_from_prod.data[0]) + + # query_filter = {"material_id": material_id} + + # # Find existing document to preserve the osti_id + # existing_doc = dois.find_one(query_filter, {"osti_id": 1}) # only retrieve osti_id + + # if not existing_doc: + # print(f"ERROR: document with material_id {material_id} not found in `dois` collection.") + # raise + + # replacement_doc = recordresponse_from_prod.model_dump() + # replacement_doc["osti_id"] = existing_doc["osti_id"] + + # dois.replace_one(query_filter, replacement_doc) + + osti_OOD_list = find_out_of_date_doi_entries(real_client, doi_client, "mp_core_blue", "robocrys", "dois_test", "dois") + print(osti_OOD_list) + + for osti_id in osti_OOD_list: + material_id_to_update = review_api.get_single_record(osti_id).site_unique_id + + new_values = { + "description": "UPDATED ROBOCRYS DESCRIPTION: " + next(real_client["mp_core_blue"]["robocrys"].find({"material_id": material_id_to_update}, {"_id": 0, "description": 1}))["description"] + } + + update_existing_osti_record(review_api, osti_id, new_values) \ No newline at end of file diff --git a/tests/file_to_jsonForUpload.py b/tests/file_to_jsonForUpload.py index f633305..aa864e0 100644 --- a/tests/file_to_jsonForUpload.py +++ b/tests/file_to_jsonForUpload.py @@ -19,8 +19,6 @@ review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) -print(review_api.get_single_record(2525340)) - atlas_user = os.environ.get("atlas_user") atlas_password = os.environ.get("atlas_password") atlas_host = os.environ.get("atlas_host") @@ -44,6 +42,8 @@ def emptyReviewAPI(reason): for record in review_api.query_records(): delete_record(review_api, record.osti_id, reason) +raise + start = timer() # Post an updated json diff --git a/tests/github_bug_report.py b/tests/github_bug_report.py index c1eec4c..3151e6d 100644 --- a/tests/github_bug_report.py +++ b/tests/github_bug_report.py @@ -1,7 +1,7 @@ -from elinkapi import Elink -from elinkapi.record import Record +from elinkapi import Elink, Organization, Person, exceptions, Record import os from dotenv import load_dotenv +from datetime import datetime load_dotenv() @@ -9,23 +9,41 @@ review_endpoint = "https://review.osti.gov/elink2api/" review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) -raise - -record_response = prod_api.get_single_record(1190959) # returns OSTI record response with OSTI ID = 1190959, which has a DOE Contract Number saved (AC02-05CH11231; EDCBEE) -record_response_dict = record_response.model_dump(exclude_none=True) -record_response_dict.pop("osti_id") # remove osti_id to allow post function - -new_record = Record(**record_response_dict) # identical record with removed OSTI_ID -for org in new_record.organizations: - if org.type == "SPONSOR": - print(org) - org.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - -# attempt to submit exact same record to review environment -record_response_after_post = review_api.post_new_record(new_record, "save") # works after re-providing the DOE contract number - -# next, attempt updating this record -record_to_update = review_api.get_single_record(record_response_after_post.osti_id) -record_to_update.title = "Updated Title For Materials Data" -review_api.update_record(record_response_after_post.osti_id, record_to_update, "submit") - +# record_response = prod_api.get_single_record(1190959) # returns OSTI record response with OSTI ID = 1190959, which has a DOE Contract Number saved (AC02-05CH11231; EDCBEE) +# record_response_dict = record_response.model_dump(exclude_none=True) +# record_response_dict.pop("osti_id") # remove osti_id to allow post function + +# new_record = Record(**record_response_dict) # identical record with removed OSTI_ID +# for org in new_record.organizations: +# if org.type == "SPONSOR": +# print(org) +# org.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + +# # attempt to submit exact same record to review environment +# record_response_after_post = review_api.post_new_record(new_record, "save") # works after re-providing the DOE contract number + +# # next, attempt updating this record +# record_to_update = review_api.get_single_record(record_response_after_post.osti_id) +# record_to_update.title = "Updated Title For Materials Data" +# review_api.update_record(record_response_after_post.osti_id, record_to_update, "submit") + +required_fields = { + "product_type": "DA", + "title": "Testing if CN_DOE can be random", + "organizations": [Organization(type='RESEARCHING', name='LBNL Materials Project (LBNL-MP)'), + Organization(type='SPONSOR', name='TEST SPONSOR ORG', identifiers=[{"type": 'CN_DOE', "value": 'oiajdiwjdiwj'}])], + "persons": [Person(type='AUTHOR', last_name='Schmoe')], + "site_ownership_code": "LBNL-MP", + "access_limitations": ['UNL'], + "publication_date": datetime.now().replace(hour=0, minute=0, second=0, microsecond=0), + "site_url": "https://next-gen.materialsproject.org/materials" +} + +empty_record = Record(**required_fields) +print(f"SUBMITTED TO OSTI, FULLY VALIDATED:\n{review_api.get_single_record(2525614)}\n\n\nTRYING TO SUBMIT:\n{empty_record}") + +try: + saved_record = review_api.post_new_record(empty_record, "submit") +except exceptions.BadRequestException as ve: + print(ve.message) + print(ve.errors) \ No newline at end of file From 4905bcb9d54cc62158f50cef8275655dee480edb Mon Sep 17 00:00:00 2001 From: HugoOnghai <99376417+HugoOnghai@users.noreply.github.com> Date: Fri, 11 Jul 2025 17:01:29 -0700 Subject: [PATCH 12/26] (after rebase) Merged upstream (#1) * move old code to 'legacy' * setup project using uv * add license * testing skeleton * gh actions skeleton * remove old reqs file to prevent dependabot alerts --------- Co-authored-by: Tyler Mathis <35553152+tsmathis@users.noreply.github.com> --- .github/workflows/testing.yml | 5 + .gitignore | 15 ++- src/mp_cite/core.py | 134 +++++++++++++++++++++++++ {mpcite => src/mp_cite}/doi_builder.py | 0 src/mp_cite/models.py | 109 ++++++++++++++++++++ src/mp_cite/pipeline.py | 105 +++++++++++++++++++ {mpcite => src/mp_cite}/reset.py | 0 src/mp_cite/send_collection.py | 79 +++++++++++++++ {mpcite => src/mp_cite}/test_core.py | 0 9 files changed, 442 insertions(+), 5 deletions(-) create mode 100644 src/mp_cite/core.py rename {mpcite => src/mp_cite}/doi_builder.py (100%) create mode 100644 src/mp_cite/models.py create mode 100644 src/mp_cite/pipeline.py rename {mpcite => src/mp_cite}/reset.py (100%) create mode 100644 src/mp_cite/send_collection.py rename {mpcite => src/mp_cite}/test_core.py (100%) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 67e0f21..9f71a9e 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -17,6 +17,11 @@ jobs: runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 +<<<<<<< HEAD +======= + with: + fetch-depth: 0 +>>>>>>> 52382ff (Merged upstream (#1)) - name: Install uv uses: astral-sh/setup-uv@v6 diff --git a/.gitignore b/.gitignore index 128afd8..72abf6e 100644 --- a/.gitignore +++ b/.gitignore @@ -78,6 +78,7 @@ target/ # Jupyter Notebook .ipynb_checkpoints +<<<<<<< HEAD # IPython profile_default/ @@ -86,7 +87,11 @@ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: +<<<<<<< HEAD .python-version +======= +# .python-version +>>>>>>> 52382ff (Merged upstream (#1)) # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -209,10 +214,10 @@ __marimo__/ # Streamlit .streamlit/secrets.toml + +# json files for storing production records *.json .env ->>>>>>> 7bb812e (preliminary version of ELinkGetResponseModel made, using ELinkAPI RecordResponse) - -json_pages/ -notebooks/ -test_json_pages/ \ No newline at end of file +/json_pages +/notebooks +/test_json_pages diff --git a/src/mp_cite/core.py b/src/mp_cite/core.py new file mode 100644 index 0000000..24be6b3 --- /dev/null +++ b/src/mp_cite/core.py @@ -0,0 +1,134 @@ +from typing import TypeAlias + +from elinkapi import Elink +from elinkapi.record import RecordResponse, Record, Organization, Person +from pymongo import MongoClient + +import requests +from elinkapi.utils import Validation + +from datetime import datetime + +OstiID: TypeAlias = int + + +def find_out_of_date_doi_entries( + rc_client: MongoClient, + doi_client: MongoClient, + robocrys_db: str, + robocrys_collection: str, + doi_db: str, + doi_collection: str, +) -> list[OstiID]: + robocrys = rc_client[robocrys_db][robocrys_collection] + dois = doi_client[doi_db][doi_collection] + + latest_doi = next( + dois.aggregate( + [ + {"$project": {"_id": 0, "date_metadata_updated": 1}}, + {"$sort": {"date_metadata_updated": -1}}, + {"$limit": 1}, + ] + ) + )["date_metadata_updated"] + + material_ids_to_update = list( + map( + lambda x: x["material_id"], + robocrys.find( + {"last_updated": {"$gt": latest_doi}}, {"_id": 0, "material_id": 1} + ), + ) + ) + + return list( + map( + lambda x: x["osti_id"], + dois.find( + {"material_id": {"$in": material_ids_to_update}}, + {"_id": 0, "osti_id": 1}, + ), + ), + ) + + +def update_existing_osti_record( + elinkapi: Elink, + osti_id: OstiID, + new_values: dict +) -> RecordResponse: + record_on_elink = elinkapi.get_single_record(osti_id) + + for keyword in new_values.keys(): + try: + setattr(record_on_elink, keyword, new_values[keyword]) + except ValueError: + print("Extraneous keywords found in the dictionary that do not correspond to attributes in the ELink API's record class.") + + # assume the use with fix the sponsor identifier bug before calling the update function + # # fix the issue with the sponsor organization's identifiers + # for entry in record_on_elink.organizations: + # if entry.type == "SPONSOR": + # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + # break + + return elinkapi.update_record(osti_id, record_on_elink, state="save") # user should use update_state_of_osti_record to submit instead + + +def submit_new_osti_record( + elinkapi: Elink, + new_record: Record, + state = "submit", # assuming there is no need to both with saving. just send new record to osti when its ready for submission. also assume bug with DOE contract number identifier in sponsor organization is accounted for +) -> RecordResponse: + # template for all repeated stuff + # only submit + record_response = elinkapi.post_new_record(new_record, state) + + return record_response + + +def update_state_of_osti_record( + elinkapi: Elink, + osti_id: OstiID, + new_state = "submit" +) -> RecordResponse: + record = elinkapi.get_single_record(osti_id) + + # assuming that the user will handle the sponsor identifier bug before calling this function + # # fix the issue with the sponsor organization's identifiers + # for entry in record.organizations: + # if entry.type == "SPONSOR": + # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + # break + + return elinkapi.update_record(osti_id, record, new_state) + + +def delete_osti_record( + elinkapi: Elink, + osti_id: OstiID, + reason: str +) -> RecordResponse: + """Delete a record by its OSTI ID.""" + response = requests.delete(f"{elinkapi.target}records/{osti_id}?reason={reason}", headers={"Authorization": f"Bearer {elinkapi.token}"}) + Validation.handle_response(response) + return response.status_code == 204 # True if deleted successfully + +def emptyReviewAPI(reason, review_api): + allDeleted = True + for record in review_api.query_records(): + delete_osti_record(review_api, record.osti_id, reason) + +def make_minimum_record_to_fully_release( + title, # required to make record + product_type = "DA", # required to make record + organizations = [Organization(type='RESEARCHING', name='LBNL Materials Project (LBNL-MP)'), + Organization(type='SPONSOR', name='TEST SPONSOR ORG', identifiers=[{"type": 'CN_DOE', "value": 'AC02-05CH11231'}])], # sponsor org is necessary for submission + persons = [Person(type='AUTHOR', last_name='Perrson')], + site_ownership_code = "LBNL-MP", + access_limitations = ['UNL'], + publication_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0), # what should this be? + site_url = "https://next-gen.materialsproject.org/materials" +) -> Record: + return Record(product_type, title, persons, site_ownership_code, access_limitations, publication_date, site_url) \ No newline at end of file diff --git a/mpcite/doi_builder.py b/src/mp_cite/doi_builder.py similarity index 100% rename from mpcite/doi_builder.py rename to src/mp_cite/doi_builder.py diff --git a/src/mp_cite/models.py b/src/mp_cite/models.py new file mode 100644 index 0000000..e4d055f --- /dev/null +++ b/src/mp_cite/models.py @@ -0,0 +1,109 @@ +from pydantic import BaseModel, Field, ConfigDict +from typing import List, Dict, Optional +import datetime +from enum import Enum +import bibtexparser +from elinkapi import Elink, Record +from elinkapi.record import RecordResponse, AccessLimitation, JournalType +from elinkapi.geolocation import Geolocation +from elinkapi.identifier import Identifier +from elinkapi.related_identifier import RelatedIdentifier +from elinkapi.person import Person +from elinkapi.organization import Organization + +class TestClass(RecordResponse): + ... + # stuff + +class ELinkGetResponseModel(BaseModel): + osti_id: Optional[int] = Field(...) + dataset_type: str = Field(default="SM") + title: str = Field(...) + persons: List[Person] + contributors: List[Dict[str, str]] = Field( + default=[{"first_name": "Materials", "last_name": "Project"}], + description="List of Dict of first name, last name mapping", + ) # no contributor + publication_date: datetime.date + site_url: str = Field(...) + doi: dict = Field( + {}, title="DOI info", description="Mainly used during GET request" + ) + mp_id: str | None = None + keywords: List[str] = None + + @classmethod + def from_elinkapi_record(cls, R): + gotResponse = ELinkGetResponseModel( + osti_id = R.osti_id, + title = R.title, + persons = R.persons, + # assume default contributors for now, creators vs contributors? + publication_date = R.publication_date, + site_url = R.site_url, + doi = {"doi": R.doi}, + mp_id = next((id.value for id in R.identifiers if id.type == 'RN'), None), + keywords = R.keywords + ) + + return gotResponse + + def get_title(self): + formula = self.keywords[1] + return "Materials Data on %s by Materials Project" % formula + + def get_site_url(self): + return "https://materialsproject.org/materials/%s" % self.mp_id + + def get_keywords(self): + # keywords = "; ".join( + # ["crystal structure", material.pretty_formula, material.chemsys] + # ) + return self.keywords + + @classmethod + def get_default_description(cls): + return ( + "Computed materials data using density " + "functional theory calculations. These calculations determine " + "the electronic structure of bulk materials by solving " + "approximations to the Schrodinger equation. For more " + "information, see https://materialsproject.org/docs/calculations" + ) + + @classmethod + def custom_to_dict(cls, elink_record) -> dict: + if elink_record.osti_id is None or elink_record.osti_id == "": + return elink_record.dict(exclude={"osti_id", "doi"}) + else: + return elink_record.dict(exclude={"doi"}) + + +class ElinkResponseStatusEnum(Enum): + SUCCESS = "SUCCESS" + FAILED = "FAILURE" + + +class ELinkPostResponseModel(BaseModel): + osti_id: str + accession_num: str + product_nos: str + title: str + contract_nos: str + other_identifying_nos: Optional[str] + doi: Dict[str, str] + status: ElinkResponseStatusEnum + status_message: Optional[str] + + def generate_doi_record(self): + doi_collection_record = DOIRecordModel( + material_id=self.accession_num, + doi=self.doi["#text"], + status=self.doi["@status"], + bibtex=None, + valid=True, + last_validated_on=datetime.now(), + ) + doi_collection_record.set_status(status=self.doi["@status"]) + doi_collection_record.last_validated_on = datetime.now() + return doi_collection_record \ No newline at end of file diff --git a/src/mp_cite/pipeline.py b/src/mp_cite/pipeline.py new file mode 100644 index 0000000..3d2c8c7 --- /dev/null +++ b/src/mp_cite/pipeline.py @@ -0,0 +1,105 @@ +import os +import json +from elinkapi import Elink, Record +from elinkapi.record import RecordResponse +from dotenv import load_dotenv + +import requests +from elinkapi.utils import Validation + +from pymongo import MongoClient +import pymongo + +from timeit import default_timer as timer +import logging +import datetime +from doi_builder import * + +load_dotenv() # depends on the root directory from which you run your python scripts. + +review_endpoint = "https://review.osti.gov/elink2api/" + +prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) +review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) + +atlas_user = os.environ.get("atlas_user") +atlas_password = os.environ.get("atlas_password") +atlas_host = os.environ.get("atlas_host") +mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" + +failed_osti_ids = [] + +cwd = os.getcwd() +path = "/json_pages/" + +for filename in os.listdir(cwd+path): + logging.debug(f"Now extracting {filename}") + file = open(cwd + path + filename, "r") + for line in file: + record = RecordResponse(**json.loads(line.strip())) + record.osti_id = record.doi.split('/')[1] + # for every record in the OSTI production environment: + # flag for update performance + update_success = False + + material_id = record.site_unique_id + + with MongoClient(mongo_uri) as client: # should I open this in or outside of the for loop? + coll = client["mp_core_blue"]["robocrys"] + res = coll.find_one({"material_id" : material_id}) + + if res != None: + robocrys_description = res["description"] + + # what if there is no document in robocrys found? + else: + logging.warning(f"No robocrys document was found to match the OSTI record: {record.osti_id}!") + + # if the description of the record on Elink doesnt match what is in the robocrys collection: + if res != None and record.description != robocrys_description: + # directly update the description of the record via the record response + record.description = robocrys_description + + # and directly update the identifier for sponsoring org + for entry in record.organizations: + if entry.type == "SPONSOR": + entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + break + + try: + # send update to the record with the record response # update_record(osti_id, record, state="save") + record_response = prod_api.update_record(record.osti_id, record, state="save") + update_success = True + + except: + logging.debug("The update failed to save!") + # add the osti_id of the failed update to failed_osti_ids + failed_osti_ids.append(record.osti_id) + + # if the update worked... + if update_success == True: + # save the record response returned with sending the update, done above + # convert that record response into a doi_model + doi_model = RecordResponse_to_doi_model(record) #change later to record response + + # upload that doi_model as a document to the new doi collection in mp_core + # what is the collection + with MongoClient() as local_client: + collection = local_client["dois_test"]["dois"] + x = collection.insert_one(doi_model.dict(by_alias=True)).inserted_id + + # else if the description on Elink matches what is in the robocrys collection: + elif record.description == robocrys_description: + # convert that record into a doi_model + doi_model = RecordResponse_to_doi_model(record) + + # upload that doi_model as a document to the new doi collection in mp_core, no updated needed! + with MongoClient() as local_client: + collection = local_client["dois_test"]["dois"] + x = collection.insert_one(doi_model).inserted_id + +cwd = os.getcwd() +path = f"/files/failed_osti_ids_{str(datetime.datetime.now())}.txt" +with open(cwd+path, 'w') as output: # change filepath as needed + for id in failed_osti_ids: + output.write(str(id) + '\n') # i'm pretty sure it's a string already though... \ No newline at end of file diff --git a/mpcite/reset.py b/src/mp_cite/reset.py similarity index 100% rename from mpcite/reset.py rename to src/mp_cite/reset.py diff --git a/src/mp_cite/send_collection.py b/src/mp_cite/send_collection.py new file mode 100644 index 0000000..0ce65a3 --- /dev/null +++ b/src/mp_cite/send_collection.py @@ -0,0 +1,79 @@ +from pathlib import Path +from xml.dom.minidom import parseString +from dicttoxml import dicttoxml +from mpcite.doi_builder import DOIBuilder +import json +from monty.json import MontyDecoder +from pydantic import BaseModel, Field +from typing import List + +default_description = ( + "Computed materials data using density functional theory calculations. These " + "calculations determine the electronic structure of bulk materials by solving " + "approximations to the Schrodinger equation. For more information, " + "see https://materialsproject.org/docs/calculations" +) + + +class CollectionsModel(BaseModel): + title: str = Field(default="Sample Title") + product_type: str = Field(default="DC") + relidentifiersblock: List[List[str]] = Field() + contributors: List[dict] + description: str = Field(default=default_description) + site_url: str = Field(default="https://materialsproject.org/") + + +config_file = Path("/Users/michaelwu/Desktop/projects/MPCite/files/config_prod.json") + +bld: DOIBuilder = json.load(config_file.open("r"), cls=MontyDecoder) +bld.config_file_path = config_file.as_posix() + +records = [ + CollectionsModel( + relidentifiersblock=[["mp-1", "mp-2", "mp-1"]], + contributors=[ + { + "first_name": "Michael", + "last_name": "Wu", + "email": "wuxiaohua1011@berkeley.edu", + } + ], + ).dict(), + CollectionsModel( + relidentifiersblock=[["mp-21"], ["mp-22"]], + contributors=[ + { + "first_name": "Michael", + "last_name": "Wu", + "email": "wuxiaohua1011@berkeley.edu", + } + ], + ).dict(), +] + + +def my_item_func(x): + if x == "records": + return "record" + elif x == "contributors": + return "contributor" + elif x == "relidentifier_detail": + return "related_identifier" + elif x == "relidentifiersblock": + return "relidentifier_detail" + else: + return "item" + + +records_xml = parseString( + dicttoxml(records, custom_root="records", attr_type=False, item_func=my_item_func) +) + +for item in records_xml.getElementsByTagName("relidentifier_detail"): + item.setAttribute("type", "accession_num") + item.setAttribute("relationType", "Compiles") + +print(records_xml.toprettyxml()) +# response = bld.elink_adapter.post_collection(data=records_xml.toxml()) +# print(response) diff --git a/mpcite/test_core.py b/src/mp_cite/test_core.py similarity index 100% rename from mpcite/test_core.py rename to src/mp_cite/test_core.py From c10e9b741a026b2fd6d9078d2203a71c6a7ca086 Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Wed, 16 Jul 2025 10:47:12 -0700 Subject: [PATCH 13/26] (after rebase) updated gitignore and moved linting to new branch --- .gitignore | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 72abf6e..14228f8 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,65 @@ cover/ *.mo *.pot +# Django stuff: +# https://github.com/github/gitignore/blob/main/Python.gitignore +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + # Django stuff: *.log local_settings.py @@ -76,9 +135,31 @@ docs/_build/ .pybuilder/ target/ +# Jupyter Notebook +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + # Jupyter Notebook .ipynb_checkpoints <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 9359e8d ((after rebase) updated gitignore and moved linting to new branch) # IPython profile_default/ @@ -87,11 +168,7 @@ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: -<<<<<<< HEAD -.python-version -======= # .python-version ->>>>>>> 52382ff (Merged upstream (#1)) # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -218,6 +295,10 @@ __marimo__/ # json files for storing production records *.json .env +<<<<<<< HEAD /json_pages /notebooks /test_json_pages +======= +/json_pages +>>>>>>> 9359e8d ((after rebase) updated gitignore and moved linting to new branch) From 9e89ad5cdd6055cf5198b8bbb18fff105799ebc0 Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Wed, 30 Jul 2025 14:29:24 -0700 Subject: [PATCH 14/26] Remove accidentally committed files --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 14228f8..b4acfca 100644 --- a/.gitignore +++ b/.gitignore @@ -296,9 +296,16 @@ __marimo__/ *.json .env <<<<<<< HEAD +<<<<<<< HEAD /json_pages /notebooks /test_json_pages ======= /json_pages >>>>>>> 9359e8d ((after rebase) updated gitignore and moved linting to new branch) +======= +/json_pages + +/test_json_pages +/notebooks +>>>>>>> 5830467 (Remove accidentally committed files) From f5a1cf48e388ed698986e9831b0dd0bafc5c2cfa Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Mon, 4 Aug 2025 11:15:46 -0700 Subject: [PATCH 15/26] Cleaning up new-core PR, restore workflows and legacy stuff --- .github/workflows/release.yml | 4 ++++ .github/workflows/testing.yml | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e5d2a28..41c038d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -66,7 +66,11 @@ jobs: uses: ad-m/github-push-action@master with: github_token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD branch: master - name: Build and Deploy! run: uvx mkdocs gh-deploy +======= +# integrate mkdocs at some point +>>>>>>> a9fa948 (Cleaning up new-core PR, restore workflows and legacy stuff) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 9f71a9e..67e0f21 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -17,11 +17,6 @@ jobs: runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 -<<<<<<< HEAD -======= - with: - fetch-depth: 0 ->>>>>>> 52382ff (Merged upstream (#1)) - name: Install uv uses: astral-sh/setup-uv@v6 From 503696a35dedbbf1b049d9713b43e98fe2af1a8e Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Mon, 4 Aug 2025 11:28:55 -0700 Subject: [PATCH 16/26] Clean up legacy for PR --- legacy/mpcite/core.py | 134 -------------------------------------- legacy/mpcite/pipeline.py | 105 ----------------------------- 2 files changed, 239 deletions(-) delete mode 100644 legacy/mpcite/core.py delete mode 100644 legacy/mpcite/pipeline.py diff --git a/legacy/mpcite/core.py b/legacy/mpcite/core.py deleted file mode 100644 index 24be6b3..0000000 --- a/legacy/mpcite/core.py +++ /dev/null @@ -1,134 +0,0 @@ -from typing import TypeAlias - -from elinkapi import Elink -from elinkapi.record import RecordResponse, Record, Organization, Person -from pymongo import MongoClient - -import requests -from elinkapi.utils import Validation - -from datetime import datetime - -OstiID: TypeAlias = int - - -def find_out_of_date_doi_entries( - rc_client: MongoClient, - doi_client: MongoClient, - robocrys_db: str, - robocrys_collection: str, - doi_db: str, - doi_collection: str, -) -> list[OstiID]: - robocrys = rc_client[robocrys_db][robocrys_collection] - dois = doi_client[doi_db][doi_collection] - - latest_doi = next( - dois.aggregate( - [ - {"$project": {"_id": 0, "date_metadata_updated": 1}}, - {"$sort": {"date_metadata_updated": -1}}, - {"$limit": 1}, - ] - ) - )["date_metadata_updated"] - - material_ids_to_update = list( - map( - lambda x: x["material_id"], - robocrys.find( - {"last_updated": {"$gt": latest_doi}}, {"_id": 0, "material_id": 1} - ), - ) - ) - - return list( - map( - lambda x: x["osti_id"], - dois.find( - {"material_id": {"$in": material_ids_to_update}}, - {"_id": 0, "osti_id": 1}, - ), - ), - ) - - -def update_existing_osti_record( - elinkapi: Elink, - osti_id: OstiID, - new_values: dict -) -> RecordResponse: - record_on_elink = elinkapi.get_single_record(osti_id) - - for keyword in new_values.keys(): - try: - setattr(record_on_elink, keyword, new_values[keyword]) - except ValueError: - print("Extraneous keywords found in the dictionary that do not correspond to attributes in the ELink API's record class.") - - # assume the use with fix the sponsor identifier bug before calling the update function - # # fix the issue with the sponsor organization's identifiers - # for entry in record_on_elink.organizations: - # if entry.type == "SPONSOR": - # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - # break - - return elinkapi.update_record(osti_id, record_on_elink, state="save") # user should use update_state_of_osti_record to submit instead - - -def submit_new_osti_record( - elinkapi: Elink, - new_record: Record, - state = "submit", # assuming there is no need to both with saving. just send new record to osti when its ready for submission. also assume bug with DOE contract number identifier in sponsor organization is accounted for -) -> RecordResponse: - # template for all repeated stuff - # only submit - record_response = elinkapi.post_new_record(new_record, state) - - return record_response - - -def update_state_of_osti_record( - elinkapi: Elink, - osti_id: OstiID, - new_state = "submit" -) -> RecordResponse: - record = elinkapi.get_single_record(osti_id) - - # assuming that the user will handle the sponsor identifier bug before calling this function - # # fix the issue with the sponsor organization's identifiers - # for entry in record.organizations: - # if entry.type == "SPONSOR": - # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - # break - - return elinkapi.update_record(osti_id, record, new_state) - - -def delete_osti_record( - elinkapi: Elink, - osti_id: OstiID, - reason: str -) -> RecordResponse: - """Delete a record by its OSTI ID.""" - response = requests.delete(f"{elinkapi.target}records/{osti_id}?reason={reason}", headers={"Authorization": f"Bearer {elinkapi.token}"}) - Validation.handle_response(response) - return response.status_code == 204 # True if deleted successfully - -def emptyReviewAPI(reason, review_api): - allDeleted = True - for record in review_api.query_records(): - delete_osti_record(review_api, record.osti_id, reason) - -def make_minimum_record_to_fully_release( - title, # required to make record - product_type = "DA", # required to make record - organizations = [Organization(type='RESEARCHING', name='LBNL Materials Project (LBNL-MP)'), - Organization(type='SPONSOR', name='TEST SPONSOR ORG', identifiers=[{"type": 'CN_DOE', "value": 'AC02-05CH11231'}])], # sponsor org is necessary for submission - persons = [Person(type='AUTHOR', last_name='Perrson')], - site_ownership_code = "LBNL-MP", - access_limitations = ['UNL'], - publication_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0), # what should this be? - site_url = "https://next-gen.materialsproject.org/materials" -) -> Record: - return Record(product_type, title, persons, site_ownership_code, access_limitations, publication_date, site_url) \ No newline at end of file diff --git a/legacy/mpcite/pipeline.py b/legacy/mpcite/pipeline.py deleted file mode 100644 index 3d2c8c7..0000000 --- a/legacy/mpcite/pipeline.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import json -from elinkapi import Elink, Record -from elinkapi.record import RecordResponse -from dotenv import load_dotenv - -import requests -from elinkapi.utils import Validation - -from pymongo import MongoClient -import pymongo - -from timeit import default_timer as timer -import logging -import datetime -from doi_builder import * - -load_dotenv() # depends on the root directory from which you run your python scripts. - -review_endpoint = "https://review.osti.gov/elink2api/" - -prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) -review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) - -atlas_user = os.environ.get("atlas_user") -atlas_password = os.environ.get("atlas_password") -atlas_host = os.environ.get("atlas_host") -mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" - -failed_osti_ids = [] - -cwd = os.getcwd() -path = "/json_pages/" - -for filename in os.listdir(cwd+path): - logging.debug(f"Now extracting {filename}") - file = open(cwd + path + filename, "r") - for line in file: - record = RecordResponse(**json.loads(line.strip())) - record.osti_id = record.doi.split('/')[1] - # for every record in the OSTI production environment: - # flag for update performance - update_success = False - - material_id = record.site_unique_id - - with MongoClient(mongo_uri) as client: # should I open this in or outside of the for loop? - coll = client["mp_core_blue"]["robocrys"] - res = coll.find_one({"material_id" : material_id}) - - if res != None: - robocrys_description = res["description"] - - # what if there is no document in robocrys found? - else: - logging.warning(f"No robocrys document was found to match the OSTI record: {record.osti_id}!") - - # if the description of the record on Elink doesnt match what is in the robocrys collection: - if res != None and record.description != robocrys_description: - # directly update the description of the record via the record response - record.description = robocrys_description - - # and directly update the identifier for sponsoring org - for entry in record.organizations: - if entry.type == "SPONSOR": - entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - break - - try: - # send update to the record with the record response # update_record(osti_id, record, state="save") - record_response = prod_api.update_record(record.osti_id, record, state="save") - update_success = True - - except: - logging.debug("The update failed to save!") - # add the osti_id of the failed update to failed_osti_ids - failed_osti_ids.append(record.osti_id) - - # if the update worked... - if update_success == True: - # save the record response returned with sending the update, done above - # convert that record response into a doi_model - doi_model = RecordResponse_to_doi_model(record) #change later to record response - - # upload that doi_model as a document to the new doi collection in mp_core - # what is the collection - with MongoClient() as local_client: - collection = local_client["dois_test"]["dois"] - x = collection.insert_one(doi_model.dict(by_alias=True)).inserted_id - - # else if the description on Elink matches what is in the robocrys collection: - elif record.description == robocrys_description: - # convert that record into a doi_model - doi_model = RecordResponse_to_doi_model(record) - - # upload that doi_model as a document to the new doi collection in mp_core, no updated needed! - with MongoClient() as local_client: - collection = local_client["dois_test"]["dois"] - x = collection.insert_one(doi_model).inserted_id - -cwd = os.getcwd() -path = f"/files/failed_osti_ids_{str(datetime.datetime.now())}.txt" -with open(cwd+path, 'w') as output: # change filepath as needed - for id in failed_osti_ids: - output.write(str(id) + '\n') # i'm pretty sure it's a string already though... \ No newline at end of file From a66640b637d9ad5062a4ac5afd8f2a1f6f047b8d Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Tue, 5 Aug 2025 12:16:52 -0700 Subject: [PATCH 17/26] Clean Up branch for PR (after rebasing after lint + release PRs were accepted) --- .github/workflows/release.yml | 4 - legacy/mpcite/models.py | 302 ++++++++++++++++++++++++++++------ 2 files changed, 256 insertions(+), 50 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 41c038d..e5d2a28 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -66,11 +66,7 @@ jobs: uses: ad-m/github-push-action@master with: github_token: ${{ secrets.GITHUB_TOKEN }} -<<<<<<< HEAD branch: master - name: Build and Deploy! run: uvx mkdocs gh-deploy -======= -# integrate mkdocs at some point ->>>>>>> a9fa948 (Cleaning up new-core PR, restore workflows and legacy stuff) diff --git a/legacy/mpcite/models.py b/legacy/mpcite/models.py index e4d055f..b2fab65 100644 --- a/legacy/mpcite/models.py +++ b/legacy/mpcite/models.py @@ -1,65 +1,101 @@ -from pydantic import BaseModel, Field, ConfigDict +from pydantic import BaseModel, Field from typing import List, Dict, Optional -import datetime +from datetime import datetime from enum import Enum import bibtexparser -from elinkapi import Elink, Record -from elinkapi.record import RecordResponse, AccessLimitation, JournalType -from elinkapi.geolocation import Geolocation -from elinkapi.identifier import Identifier -from elinkapi.related_identifier import RelatedIdentifier -from elinkapi.person import Person -from elinkapi.organization import Organization - -class TestClass(RecordResponse): - ... - # stuff + + +class ConnectionModel(BaseModel): + endpoint: str = Field(..., title="URL Endpoint of the connection") + username: str = Field(..., title="User Name") + password: str = Field(..., title="Password") + + +class RoboCrysModel(BaseModel): + material_id: str + last_updated: datetime + description: Optional[str] = None + error: Optional[str] = None + + @classmethod + def get_default_description(cls): + return ( + "Computed materials data using density " + "functional theory calculations. These calculations determine " + "the electronic structure of bulk materials by solving " + "approximations to the Schrodinger equation. For more " + "information, see https://materialsproject.org/docs/calculations" + ) + + +class MaterialModel(BaseModel): + last_updated: datetime = Field( + None, title="timestamp for the most recent calculation" + ) + updated_at: datetime = Field(None, title="alternative to last_updated") + created_at: datetime = Field( + None, + description="creation time for this material defined by when the first structure " + "optimization calculation was run", + ) + task_id: str = Field( + "", title="task id for this material. Also called the material id" + ) + # pretty_formula: str = Field(..., title="clean representation of the formula") + pretty_formula: str = Field(..., title="clean representation of the formula") + chemsys: str + class ELinkGetResponseModel(BaseModel): - osti_id: Optional[int] = Field(...) + osti_id: Optional[str] = Field(...) dataset_type: str = Field(default="SM") title: str = Field(...) - persons: List[Person] + creators: str = Field(default="Kristin Persson") # replace with authors contributors: List[Dict[str, str]] = Field( default=[{"first_name": "Materials", "last_name": "Project"}], description="List of Dict of first name, last name mapping", ) # no contributor - publication_date: datetime.date + product_nos: str = Field(..., title="MP id") + accession_num: str = Field(..., title="MP id") + contract_nos: str = Field("AC02-05CH11231; EDCBEE") + originating_research_org: str = Field( + default="Lawrence Berkeley National Laboratory (LBNL), Berkeley, CA (United States)" + ) + publication_date: str = Field(...) + language: str = Field(default="English") + country: str = Field(default="US") + sponsor_org: str = Field( + default="USDOE Office of Science (SC), Basic Energy Sciences (BES) (SC-22)" + ) site_url: str = Field(...) + contact_name: str = Field(default="Kristin Persson") + contact_org: str = Field(default="LBNL") + contact_email: str = Field(default="feedback@materialsproject.org") + contact_phone: str = Field(default="+1(510)486-7218") + related_resource: str = Field("https://materialsproject.org/citing") + contributor_organizations: str = Field(default="MIT; UC Berkeley; Duke; U Louvain") + subject_categories_code: str = Field(default="36 MATERIALS SCIENCE") + keywords: str = Field(...) + description: str = Field(default="") doi: dict = Field( {}, title="DOI info", description="Mainly used during GET request" ) - mp_id: str | None = None - keywords: List[str] = None @classmethod - def from_elinkapi_record(cls, R): - gotResponse = ELinkGetResponseModel( - osti_id = R.osti_id, - title = R.title, - persons = R.persons, - # assume default contributors for now, creators vs contributors? - publication_date = R.publication_date, - site_url = R.site_url, - doi = {"doi": R.doi}, - mp_id = next((id.value for id in R.identifiers if id.type == 'RN'), None), - keywords = R.keywords - ) - - return gotResponse - - def get_title(self): - formula = self.keywords[1] + def get_title(cls, material: MaterialModel): + formula = material.pretty_formula return "Materials Data on %s by Materials Project" % formula - def get_site_url(self): - return "https://materialsproject.org/materials/%s" % self.mp_id + @classmethod + def get_site_url(cls, mp_id): + return "https://materialsproject.org/materials/%s" % mp_id - def get_keywords(self): - # keywords = "; ".join( - # ["crystal structure", material.pretty_formula, material.chemsys] - # ) - return self.keywords + @classmethod + def get_keywords(cls, material): + keywords = "; ".join( + ["crystal structure", material.pretty_formula, material.chemsys] + ) + return keywords @classmethod def get_default_description(cls): @@ -77,11 +113,11 @@ def custom_to_dict(cls, elink_record) -> dict: return elink_record.dict(exclude={"osti_id", "doi"}) else: return elink_record.dict(exclude={"doi"}) - + class ElinkResponseStatusEnum(Enum): - SUCCESS = "SUCCESS" - FAILED = "FAILURE" + SUCCESS = "SUCCESS" + FAILED = "FAILURE" class ELinkPostResponseModel(BaseModel): @@ -106,4 +142,178 @@ def generate_doi_record(self): ) doi_collection_record.set_status(status=self.doi["@status"]) doi_collection_record.last_validated_on = datetime.now() - return doi_collection_record \ No newline at end of file + return doi_collection_record + + +class DOIRecordStatusEnum(str, Enum): + COMPLETED = "COMPLETED" + PENDING = "PENDING" + FAILURE = "FAILURE" + INIT = "INIT" + + +class DOIRecordModel(BaseModel): + material_id: str = Field(...) + doi: str = Field(default="") + bibtex: Optional[str] = None + status: DOIRecordStatusEnum + valid: bool = Field(False) + last_updated: datetime = Field( + default=datetime.now(), + title="DOI last updated time.", + description="Last updated is defined as either a Bibtex or status change.", + ) + created_at: datetime = Field( + default=datetime.now(), + title="DOI Created At", + description="creation time for this DOI record", + ) + last_validated_on: datetime = Field( + default=datetime.now(), + title="Date Last Validated", + description="Date that this data is last validated, " "not necessarily updated", + ) + elsevier_updated_on: datetime = Field( + default=datetime.now(), + title="Date Elsevier is updated", + description="If None, means never uploaded to elsevier", + ) + error: Optional[str] = Field( + default=None, description="None if no error, else error message" + ) + + class Config: + use_enum_values = True + + def set_status(self, status): + self.status = status + + def get_osti_id(self): + if self.doi is None or self.doi == "": + return "" + else: + return self.doi.split("/")[-1] + + def get_bibtex_abstract(self): + try: + if self.bibtex is None: + return "" + bib_db: bibtexparser.bibdatabase.BibDatabase = bibtexparser.loads( + self.bibtex + ) + if bib_db.entries: + return bib_db.entries[0]["abstractnote"] + except Exception as e: + print(e) + return "" + + +class OSTIDOIRecordModel(DOIRecordModel): + material_id: str = Field(...) + doi: str = Field(default="") + bibtex: Optional[str] = None + valid: bool = Field(False) + last_updated: datetime = Field( + default=datetime.now(), + title="DOI last updated time.", + description="Last updated is defined as either a Bibtex or status change.", + ) + + +class ElsevierPOSTContainerModel(BaseModel): + identifier: str = Field(default="", title="mp_id") + source: str = "MATERIALS_PROJECT" + date: str = datetime.now().date().isoformat().__str__() + title: str + description: str = "" + doi: str + authors: List[str] = ["Kristin Persson"] + url: str + type: str = "dataset" + dateAvailable: str = datetime.now().date().isoformat().__str__() + dateCreated: str = datetime.now().date().isoformat().__str__() + version: str = "1.0.0" + funding: str = "USDOE Office of Science (SC), Basic Energy Sciences (BES) (SC-22)" + language: str = "en" + method: str = "Materials Project" + accessRights: str = "Public" + contact: str = "Kristin Persson " + dataStandard: str = "https://materialsproject.org/citing" + howToCite: str = "https://materialsproject.org/citing" + subjectAreas: List[str] = ["36 MATERIALS SCIENCE"] + keywords: List[str] + institutions: List[str] = ["Lawrence Berkeley National Laboratory"] + institutionIds: List[str] = ["AC02-05CH11231; EDCBEE"] + spatialCoverage: List[str] = [] + temporalCoverage: List[str] = [] + references: List[str] = ["https://materialsproject.org/citing"] + relatedResources: List[str] = ["https://materialsproject.org/citing"] + location: str = "1 Cyclotron Rd, Berkeley, CA 94720" + childContainerIds: List[str] = [] + + @classmethod + def get_url(cls, mp_id): + return "https://materialsproject.org/materials/%s" % mp_id + + @classmethod + def get_keywords(cls, material: MaterialModel): + return ["crystal structure", material.pretty_formula, material.chemsys] + + @classmethod + def get_default_description(cls): + return ( + "Computed materials data using density " + "functional theory calculations. These calculations determine " + "the electronic structure of bulk materials by solving " + "approximations to the Schrodinger equation. For more " + "information, see https://materialsproject.org/docs/calculations" + ) + + @classmethod + def get_date_created(cls, material: MaterialModel) -> str: + return material.created_at.date().__str__() + + @classmethod + def get_date_available(cls, material: MaterialModel) -> str: + return material.created_at.date().__str__() + + @classmethod + def get_title(cls, material: MaterialModel) -> str: + return material.pretty_formula + + @classmethod + def from_material_model(cls, material: MaterialModel, doi: str, description: str): + model = ElsevierPOSTContainerModel( + identifier=material.task_id, + title=material.pretty_formula, + doi=doi, + url="https://materialsproject.org/materials/%s" % material.task_id, + keywords=["crystal structure", material.pretty_formula, material.chemsys], + date=datetime.now().date().__str__(), + dateCreated=material.created_at.date().__str__(), + dateAvailable=ElsevierPOSTContainerModel.get_date_available(material), + description=description, + ) + return model + + +class ExplorerGetJSONResponseModel(BaseModel): + osti_id: str + title: str + report_number: str + doi: str + product_type: str + language: str + country_publication: str + description: str + site_ownership_code: str + publication_date: str + entry_date: str + contributing_organizations: str + authors: List[str] + subjects: List[str] + contributing_org: str + doe_contract_number: str + sponsor_orgs: List[str] + research_orgs: List[str] + links: List[Dict[str, str]] From 1cef22272fe72f32be12ec5ca3eec2a0561391f9 Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Tue, 5 Aug 2025 12:18:19 -0700 Subject: [PATCH 18/26] Cleaned up .gitignore --- .gitignore | 104 +++-------------------------------------------------- 1 file changed, 4 insertions(+), 100 deletions(-) diff --git a/.gitignore b/.gitignore index b4acfca..8241d4c 100644 --- a/.gitignore +++ b/.gitignore @@ -56,65 +56,6 @@ cover/ *.mo *.pot -# Django stuff: -# https://github.com/github/gitignore/blob/main/Python.gitignore -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[codz] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py.cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - # Django stuff: *.log local_settings.py @@ -135,31 +76,8 @@ docs/_build/ .pybuilder/ target/ -# Jupyter Notebook -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - # Jupyter Notebook .ipynb_checkpoints -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> 9359e8d ((after rebase) updated gitignore and moved linting to new branch) # IPython profile_default/ @@ -168,7 +86,7 @@ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -292,20 +210,6 @@ __marimo__/ # Streamlit .streamlit/secrets.toml -# json files for storing production records -*.json -.env -<<<<<<< HEAD -<<<<<<< HEAD -/json_pages -/notebooks -/test_json_pages -======= -/json_pages ->>>>>>> 9359e8d ((after rebase) updated gitignore and moved linting to new branch) -======= -/json_pages - -/test_json_pages -/notebooks ->>>>>>> 5830467 (Remove accidentally committed files) +json_pages/ +notebooks/ +test_json_pages/ \ No newline at end of file From a4ab306cad27eda3994434e00fa6452f30be413b Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Tue, 5 Aug 2025 16:50:34 -0700 Subject: [PATCH 19/26] Addresses comments raised in PR for core.py --- src/mp_cite/core.py | 130 +++++++++++++++++++++++++++++++------------- 1 file changed, 92 insertions(+), 38 deletions(-) diff --git a/src/mp_cite/core.py b/src/mp_cite/core.py index 24be6b3..5e9c5bc 100644 --- a/src/mp_cite/core.py +++ b/src/mp_cite/core.py @@ -8,6 +8,10 @@ from elinkapi.utils import Validation from datetime import datetime +import pytz + +from pydantic import Field +from typing import List, Literal OstiID: TypeAlias = int @@ -20,6 +24,18 @@ def find_out_of_date_doi_entries( doi_db: str, doi_collection: str, ) -> list[OstiID]: + """ + find_out_of_date_doi_entries queries MP's mongo collections to find all robocrys documents that were updated less recently than the latest doi document + + :rc_client is the MongoClient used to access the robocrys collection + :doi_client is the MongoClient used to access the doi collection (since a new doi collection is planned, they clients are passed separately, though in the future they may be the same client.) + :robocrys_db is the name of the database the robocrys collection is in + :robocrys_collection is the name of the robocrys collection + :doi_db is the name of the database the doi collection is in + :doi_collection is the name of the doi collection + + returns a list containing all OSTI IDs associated with out-of-date doi entries. + """ robocrys = rc_client[robocrys_db][robocrys_collection] dois = doi_client[doi_db][doi_collection] @@ -54,17 +70,20 @@ def find_out_of_date_doi_entries( def update_existing_osti_record( - elinkapi: Elink, - osti_id: OstiID, - new_values: dict + elinkapi: Elink, osti_id: OstiID, new_values: dict ) -> RecordResponse: + """ + update_existing_osti_record allows users to provide a dictionary of keywords and new values, which will replace the old values under the same keywords in the record with the given osti id + + :elinkapi is the instance of the elinkapi associated with the environment in which the record is held (e.g. either production or review environment) + :osti_id is the osti id of the record which ought to be updated + :new_values is a dictionary of keywords (which should exist in ELink's record model) and new value pairs. + """ + record_on_elink = elinkapi.get_single_record(osti_id) - for keyword in new_values.keys(): - try: - setattr(record_on_elink, keyword, new_values[keyword]) - except ValueError: - print("Extraneous keywords found in the dictionary that do not correspond to attributes in the ELink API's record class.") + for keyword in new_values: + setattr(record_on_elink, keyword, new_values[keyword]) # assume the use with fix the sponsor identifier bug before calling the update function # # fix the issue with the sponsor organization's identifiers @@ -73,27 +92,49 @@ def update_existing_osti_record( # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] # break - return elinkapi.update_record(osti_id, record_on_elink, state="save") # user should use update_state_of_osti_record to submit instead + return elinkapi.update_record( + osti_id, record_on_elink, state="save" + ) # user should use update_state_of_osti_record to submit instead def submit_new_osti_record( elinkapi: Elink, - new_record: Record, - state = "submit", # assuming there is no need to both with saving. just send new record to osti when its ready for submission. also assume bug with DOE contract number identifier in sponsor organization is accounted for + new_values: dict, + state="submit", # assuming there is no need to both with saving. just send new record to osti when its ready for submission. also assume bug with DOE contract number identifier in sponsor organization is accounted for ) -> RecordResponse: + """ + submit_new_osti_record generates a new record based on the provided keyword-value pairs in the new_values dict and the default minimum DA Record metadata necessary for submission + + :elinkapi is the elinkapi (see previous) + :new_values is the dictionary of keywords and values which want to be included in the submitted record (besides or in lieu of default values). The title MUST be provided. + :state defaults to "submit" but the user can simply "save" if desired. + + returns the record response after submission + """ + # template for all repeated stuff # only submit + new_record = MinimumDARecord( + **new_values + ) # record is an instance of the MinimumDARecord model which gives default values to all necessary fields (EXCEPT Title) record_response = elinkapi.post_new_record(new_record, state) return record_response def update_state_of_osti_record( - elinkapi: Elink, - osti_id: OstiID, - new_state = "submit" + elinkapi: Elink, osti_id: OstiID, new_state: Literal["save", "submit"] ) -> RecordResponse: record = elinkapi.get_single_record(osti_id) + """ + update_state_of_osti_record allows a user to update the state of a record with provided osti_id to either "save" or "submit" (the two valid states) + + :elinkapi is the elinkapi (see previous) + :osti_id is the OSTI ID associated with the record of which to update state. + :new_state is a Literal object, in this case a subtype of strings (either "save" or "submit"). + + returns the record response after updating the state. + """ # assuming that the user will handle the sponsor identifier bug before calling this function # # fix the issue with the sponsor organization's identifiers @@ -105,30 +146,43 @@ def update_state_of_osti_record( return elinkapi.update_record(osti_id, record, new_state) -def delete_osti_record( - elinkapi: Elink, - osti_id: OstiID, - reason: str -) -> RecordResponse: - """Delete a record by its OSTI ID.""" - response = requests.delete(f"{elinkapi.target}records/{osti_id}?reason={reason}", headers={"Authorization": f"Bearer {elinkapi.token}"}) +def delete_osti_record(elinkapi: Elink, osti_id: OstiID, reason: str) -> bool: + """ + Delete a record by its OSTI ID. + + :elinkapi is the elinkapi + :osti_id is the osti_id associated with the record which ought to be deleted + :reason is a str object which explains in words why the record is to be deleted (necessary for the http request) + + returns true if deleted successfully, else false, which indicates a bad status_code + """ + response = requests.delete( + f"{elinkapi.target}records/{osti_id}?reason={reason}", + headers={"Authorization": f"Bearer {elinkapi.token}"}, + ) Validation.handle_response(response) return response.status_code == 204 # True if deleted successfully -def emptyReviewAPI(reason, review_api): - allDeleted = True - for record in review_api.query_records(): - delete_osti_record(review_api, record.osti_id, reason) - -def make_minimum_record_to_fully_release( - title, # required to make record - product_type = "DA", # required to make record - organizations = [Organization(type='RESEARCHING', name='LBNL Materials Project (LBNL-MP)'), - Organization(type='SPONSOR', name='TEST SPONSOR ORG', identifiers=[{"type": 'CN_DOE', "value": 'AC02-05CH11231'}])], # sponsor org is necessary for submission - persons = [Person(type='AUTHOR', last_name='Perrson')], - site_ownership_code = "LBNL-MP", - access_limitations = ['UNL'], - publication_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0), # what should this be? - site_url = "https://next-gen.materialsproject.org/materials" -) -> Record: - return Record(product_type, title, persons, site_ownership_code, access_limitations, publication_date, site_url) \ No newline at end of file + +class MinimumDARecord(Record): + product_type: str = Field(default="DA") + title: str # Required + organizations: List[Organization] = Field( + default_factory=lambda: [ + Organization(type="RESEARCHING", name="LBNL Materials Project (LBNL-MP)"), + Organization( + type="SPONSOR", + name="TEST SPONSOR ORG", + identifiers=[{"type": "CN_DOE", "value": "AC02-05CH11231"}], + ), # sponsor org is necessary for submission + ] + ) + persons: List[Person] = Field( + default_factory=lambda: [Person(type="AUTHOR", last_name="Persson")] + ) + site_ownership_code: str = Field(default="LBNL-MP") + access_limitations: List[str] = Field(default_factory=lambda: ["UNL"]) + publication_date: datetime = Field( + default_factory=lambda: datetime.now(tz=pytz.UTC) + ) + site_url: str = Field(default="https://next-gen.materialsproject.org/materials") From ed8906704d596dddd24b68d35cbdd3bf91e47150 Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Wed, 6 Aug 2025 09:42:46 -0700 Subject: [PATCH 20/26] Removed extraneous files from PR --- src/mp_cite/pipeline.py | 105 --------------------------------- src/mp_cite/reset.py | 26 -------- src/mp_cite/send_collection.py | 79 ------------------------- src/mp_cite/test_core.py | 84 -------------------------- 4 files changed, 294 deletions(-) delete mode 100644 src/mp_cite/pipeline.py delete mode 100644 src/mp_cite/reset.py delete mode 100644 src/mp_cite/send_collection.py delete mode 100644 src/mp_cite/test_core.py diff --git a/src/mp_cite/pipeline.py b/src/mp_cite/pipeline.py deleted file mode 100644 index 3d2c8c7..0000000 --- a/src/mp_cite/pipeline.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import json -from elinkapi import Elink, Record -from elinkapi.record import RecordResponse -from dotenv import load_dotenv - -import requests -from elinkapi.utils import Validation - -from pymongo import MongoClient -import pymongo - -from timeit import default_timer as timer -import logging -import datetime -from doi_builder import * - -load_dotenv() # depends on the root directory from which you run your python scripts. - -review_endpoint = "https://review.osti.gov/elink2api/" - -prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) -review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) - -atlas_user = os.environ.get("atlas_user") -atlas_password = os.environ.get("atlas_password") -atlas_host = os.environ.get("atlas_host") -mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" - -failed_osti_ids = [] - -cwd = os.getcwd() -path = "/json_pages/" - -for filename in os.listdir(cwd+path): - logging.debug(f"Now extracting {filename}") - file = open(cwd + path + filename, "r") - for line in file: - record = RecordResponse(**json.loads(line.strip())) - record.osti_id = record.doi.split('/')[1] - # for every record in the OSTI production environment: - # flag for update performance - update_success = False - - material_id = record.site_unique_id - - with MongoClient(mongo_uri) as client: # should I open this in or outside of the for loop? - coll = client["mp_core_blue"]["robocrys"] - res = coll.find_one({"material_id" : material_id}) - - if res != None: - robocrys_description = res["description"] - - # what if there is no document in robocrys found? - else: - logging.warning(f"No robocrys document was found to match the OSTI record: {record.osti_id}!") - - # if the description of the record on Elink doesnt match what is in the robocrys collection: - if res != None and record.description != robocrys_description: - # directly update the description of the record via the record response - record.description = robocrys_description - - # and directly update the identifier for sponsoring org - for entry in record.organizations: - if entry.type == "SPONSOR": - entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - break - - try: - # send update to the record with the record response # update_record(osti_id, record, state="save") - record_response = prod_api.update_record(record.osti_id, record, state="save") - update_success = True - - except: - logging.debug("The update failed to save!") - # add the osti_id of the failed update to failed_osti_ids - failed_osti_ids.append(record.osti_id) - - # if the update worked... - if update_success == True: - # save the record response returned with sending the update, done above - # convert that record response into a doi_model - doi_model = RecordResponse_to_doi_model(record) #change later to record response - - # upload that doi_model as a document to the new doi collection in mp_core - # what is the collection - with MongoClient() as local_client: - collection = local_client["dois_test"]["dois"] - x = collection.insert_one(doi_model.dict(by_alias=True)).inserted_id - - # else if the description on Elink matches what is in the robocrys collection: - elif record.description == robocrys_description: - # convert that record into a doi_model - doi_model = RecordResponse_to_doi_model(record) - - # upload that doi_model as a document to the new doi collection in mp_core, no updated needed! - with MongoClient() as local_client: - collection = local_client["dois_test"]["dois"] - x = collection.insert_one(doi_model).inserted_id - -cwd = os.getcwd() -path = f"/files/failed_osti_ids_{str(datetime.datetime.now())}.txt" -with open(cwd+path, 'w') as output: # change filepath as needed - for id in failed_osti_ids: - output.write(str(id) + '\n') # i'm pretty sure it's a string already though... \ No newline at end of file diff --git a/src/mp_cite/reset.py b/src/mp_cite/reset.py deleted file mode 100644 index 350b50c..0000000 --- a/src/mp_cite/reset.py +++ /dev/null @@ -1,26 +0,0 @@ -from mpcite.core import * -from mpcite.doi_builder import RecordResponse_to_doi_model, upload_doi_document_model_to_collection -import os -import json -from dotenv import load_dotenv - -load_dotenv() # depends on the root directory from which you run your python scripts. - -review_endpoint = "https://review.osti.gov/elink2api/" - -prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) -review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) - -cwd = os.getcwd() -path = "/json_pages/page_number_1000.0" # IT'S ONLY DOING ONE FILE RIGHT NOW -file = open(cwd + path, "r") - -atlas_user = os.environ.get("atlas_user") -atlas_password = os.environ.get("atlas_password") -atlas_host = os.environ.get("atlas_host") -mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" - -# emptyReviewAPI("Testing", review_api) - -with MongoClient() as client: - client.dois_test.dois.delete_many({}, comment="Testing") \ No newline at end of file diff --git a/src/mp_cite/send_collection.py b/src/mp_cite/send_collection.py deleted file mode 100644 index 0ce65a3..0000000 --- a/src/mp_cite/send_collection.py +++ /dev/null @@ -1,79 +0,0 @@ -from pathlib import Path -from xml.dom.minidom import parseString -from dicttoxml import dicttoxml -from mpcite.doi_builder import DOIBuilder -import json -from monty.json import MontyDecoder -from pydantic import BaseModel, Field -from typing import List - -default_description = ( - "Computed materials data using density functional theory calculations. These " - "calculations determine the electronic structure of bulk materials by solving " - "approximations to the Schrodinger equation. For more information, " - "see https://materialsproject.org/docs/calculations" -) - - -class CollectionsModel(BaseModel): - title: str = Field(default="Sample Title") - product_type: str = Field(default="DC") - relidentifiersblock: List[List[str]] = Field() - contributors: List[dict] - description: str = Field(default=default_description) - site_url: str = Field(default="https://materialsproject.org/") - - -config_file = Path("/Users/michaelwu/Desktop/projects/MPCite/files/config_prod.json") - -bld: DOIBuilder = json.load(config_file.open("r"), cls=MontyDecoder) -bld.config_file_path = config_file.as_posix() - -records = [ - CollectionsModel( - relidentifiersblock=[["mp-1", "mp-2", "mp-1"]], - contributors=[ - { - "first_name": "Michael", - "last_name": "Wu", - "email": "wuxiaohua1011@berkeley.edu", - } - ], - ).dict(), - CollectionsModel( - relidentifiersblock=[["mp-21"], ["mp-22"]], - contributors=[ - { - "first_name": "Michael", - "last_name": "Wu", - "email": "wuxiaohua1011@berkeley.edu", - } - ], - ).dict(), -] - - -def my_item_func(x): - if x == "records": - return "record" - elif x == "contributors": - return "contributor" - elif x == "relidentifier_detail": - return "related_identifier" - elif x == "relidentifiersblock": - return "relidentifier_detail" - else: - return "item" - - -records_xml = parseString( - dicttoxml(records, custom_root="records", attr_type=False, item_func=my_item_func) -) - -for item in records_xml.getElementsByTagName("relidentifier_detail"): - item.setAttribute("type", "accession_num") - item.setAttribute("relationType", "Compiles") - -print(records_xml.toprettyxml()) -# response = bld.elink_adapter.post_collection(data=records_xml.toxml()) -# print(response) diff --git a/src/mp_cite/test_core.py b/src/mp_cite/test_core.py deleted file mode 100644 index 948d83c..0000000 --- a/src/mp_cite/test_core.py +++ /dev/null @@ -1,84 +0,0 @@ -from mpcite.core import * -from mpcite.doi_builder import RecordResponse_to_doi_model, upload_doi_document_model_to_collection -import os -import json -from dotenv import load_dotenv - -load_dotenv() # depends on the root directory from which you run your python scripts. - -review_endpoint = "https://review.osti.gov/elink2api/" - -prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) -review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) - -cwd = os.getcwd() -path = "/json_pages/page_number_1000.0" # IT'S ONLY DOING ONE FILE RIGHT NOW -file = open(cwd + path, "r") - -atlas_user = os.environ.get("atlas_user") -atlas_password = os.environ.get("atlas_password") -atlas_host = os.environ.get("atlas_host") -mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" - -with MongoClient(mongo_uri) as real_client: - with MongoClient() as doi_client: # open the mongoclient outside of the for loop, is more efficient than opening and closing it repeatedly - dois = doi_client["dois_test"]["dois"] - - # for line in file: - # js = json.loads(line.strip()) - - # # temporarily fix the sponsor organization bug - # for entry in js["organizations"]: - # if entry["type"] == "SPONSOR": - # entry["identifiers"] = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - - # my_record = Record(**js) - - # # make a post to the elink review environment - # saved_record = review_api.post_new_record(my_record, state="submit") - - # # make a doi document with saved_record - # doi_model = RecordResponse_to_doi_model(saved_record) - - # # now, add that doi to the local doi collection - # upload_doi_document_model_to_collection(doi_model, dois) - - # all_material_ids = [doc["material_id"] for doc in dois.find({}, {"_id": 0, "material_id": 1})] - - # for material_id in all_material_ids: - - # # query prod env for record with materials_id == site_unique_id - # record_from_prod = prod_api.query_records(site_unique_id=material_id) - - # if record_from_prod.total_rows != 1: - # print(f"ERROR: not unique Material_ID! {material_id}") - # raise - - # # make a doi_model from that data - # recordresponse_from_prod = RecordResponse_to_doi_model(record_from_prod.data[0]) - - # query_filter = {"material_id": material_id} - - # # Find existing document to preserve the osti_id - # existing_doc = dois.find_one(query_filter, {"osti_id": 1}) # only retrieve osti_id - - # if not existing_doc: - # print(f"ERROR: document with material_id {material_id} not found in `dois` collection.") - # raise - - # replacement_doc = recordresponse_from_prod.model_dump() - # replacement_doc["osti_id"] = existing_doc["osti_id"] - - # dois.replace_one(query_filter, replacement_doc) - - osti_OOD_list = find_out_of_date_doi_entries(real_client, doi_client, "mp_core_blue", "robocrys", "dois_test", "dois") - print(osti_OOD_list) - - for osti_id in osti_OOD_list: - material_id_to_update = review_api.get_single_record(osti_id).site_unique_id - - new_values = { - "description": "UPDATED ROBOCRYS DESCRIPTION: " + next(real_client["mp_core_blue"]["robocrys"].find({"material_id": material_id_to_update}, {"_id": 0, "description": 1}))["description"] - } - - update_existing_osti_record(review_api, osti_id, new_values) \ No newline at end of file From cf449f59aecea7297b323ab7e5ea595c3b6f02f0 Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Wed, 6 Aug 2025 10:37:04 -0700 Subject: [PATCH 21/26] Removed models.py and cleaned up doi_builder.py --- src/mp_cite/doi_builder.py | 203 +++++++------------------------------ src/mp_cite/models.py | 109 -------------------- 2 files changed, 38 insertions(+), 274 deletions(-) delete mode 100644 src/mp_cite/models.py diff --git a/src/mp_cite/doi_builder.py b/src/mp_cite/doi_builder.py index 713724c..04bf257 100644 --- a/src/mp_cite/doi_builder.py +++ b/src/mp_cite/doi_builder.py @@ -1,187 +1,60 @@ -''' -doi_builder.py -A doi collection must store the following information about a document: -- doi number -- title -- osti id (ELink's Unique Identifier) -- material id (MP's Unique Identifier) -- date of system entry date (Date (UTC) of this revision's inception) -- date of last update (date edited or date_submitted_to_osti_last) (take from ELink) -- workflow status and the date (?) of each step: - - SA, saved, in a holding state, not to be processed - - SR, submit to releasing official "released_to_osti_date, as entered by releasing official" - - SO, submit to OSTI - - SF, submitted but failed validation - - SX, submitted but failed to release - - SV, submitted and validated - - R, released -- - -Here is an example of RecordResponse -RecordResponse( - osti_id=2523296, - workflow_status='SA', - access_limitations=['UNL'], - access_limitation_other=None, - announcement_codes=None, - availability=None, - edition=None, - volume=None, - - # Identifiers - identifiers=[ - Identifier(type='CN_NONDOE', value='EDCBEE'), - Identifier(type='CN_DOE', value='AC02-05CH11231'), - Identifier(type='RN', value='mp-1037659'), - ], - - # People involved - persons=[ - Person( - type='CONTACT', - first_name='Kristin', - last_name='Persson', - phone='+1(510)486-7218', - email=['feedback@materialsproject.org'], - affiliations=[ - Affiliation(name='LBNL') - ] - ) - ], - - # Organizations - organizations=[ - Organization(name='The Materials Project', type='CONTRIBUTING', contributor_type='ResearchGroup'), - Organization(name='LBNL Materials Project', type='RESEARCHING'), - Organization(name='Lawrence Berkeley National Laboratory (LBNL), Berkeley, CA (United States)', type='RESEARCHING'), - Organization(name='USDOE Office of Science (SC), Basic Energy Sciences (BES) (SC-22)', type='SPONSOR'), - Organization(name='MIT', type='CONTRIBUTING', contributor_type='Other'), - Organization(name='UC Berkeley', type='CONTRIBUTING', contributor_type='Other'), - Organization(name='Duke', type='CONTRIBUTING', contributor_type='Other'), - Organization(name='U Louvain', type='CONTRIBUTING', contributor_type='Other'), - ], - - # Metadata - country_publication_code='US', - doe_supported_flag=False, - doi='10.17188/1714845', - edit_reason='Record updated upon request of LBNL-MP to remove authors and replace with a single collaborator.', - format_information='', - invention_disclosure_flag=None, - paper_flag=False, - peer_reviewed_flag=False, - product_type='DA', - publication_date=datetime.date(2020, 4, 30), - publication_date_text='04/30/2020', - site_url='https://materialsproject.org/materials/mp-1037659', - site_ownership_code='LBNL-MP', - site_unique_id='mp-1037659', - subject_category_code=['36'], - title='Materials Data on RbYMg30O32 by Materials Project', - - # Description - description=""" - RbMg₃₀YO₃₂ is Molybdenum Carbide MAX Phase-derived and crystallizes in the tetragonal P4/mmm space group. - Rb¹⁺ is bonded to six O²⁻ atoms to form RbO₆ octahedra... - (Truncated here for brevity, full description is included in original) - """, - - keywords=['crystal structure', 'RbYMg30O32', 'Mg-O-Rb-Y'], - languages=['English'], - related_doc_info='https://materialsproject.org/citing', - - # Media - media=[ - MediaInfo( - media_id=1908478, - osti_id=2523296, - status='C', - mime_type='text/html', - files=[ - MediaFile( - media_file_id=12017281, - media_type='O', - url='https://materialsproject.org/materials/mp-1037659' - ), - MediaFile( - media_file_id=12017284, - media_type='C', - mime_type='text/html', - media_source='OFF_SITE_DOWNLOAD' - ) - ] - ) - ], - - # Audit logs - audit_logs=[ - AuditLog( - messages=['Revision status is not correct, found SA'], - status='FAIL', - type='RELEASER', - audit_date=datetime.datetime(2025, 6, 30, 22, 30, 24, 865000, tzinfo=TzInfo(UTC)) - ) - ], - - # Timestamps - date_metadata_added=datetime.datetime(2025, 6, 30, 22, 30, 20, 495000, tzinfo=TzInfo(UTC)), - date_metadata_updated=datetime.datetime(2025, 6, 30, 22, 30, 22, 247000, tzinfo=TzInfo(UTC)), - - # Misc - revision=2, - added_by=139001, - edited_by=139001, - collection_type='DOE_LAB', - hidden_flag=False -) -''' - -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, Field from datetime import datetime -# TODO: change the field names to match ELINK -class doi_model(BaseModel): + +class DOIModel(BaseModel): # identifiers - doi: str = Field(description="The DOI number as allocated by OSTI") # can be taken from ELink API - title: str = Field(description="The title of the record") # can be taken from ELink API - osti_id: str = Field(description="The OSTI ID number allocated by OSTI to make the DOI number") # can be taken from ELink API - material_id: str # can be taken from Robocrys Collection or ELink API + doi: str = Field( + description="The DOI number as allocated by OSTI" + ) # can be taken from ELink API + title: str = Field( + description="The title of the record" + ) # can be taken from ELink API + osti_id: str = Field( + coerce_numbers_to_str=True, + description="The OSTI ID number allocated by OSTI to make the DOI number", + ) # can be taken from ELink API + material_id: str # can be taken from Robocrys Collection or ELink API # time stamps - date_metadata_added: datetime | None = Field(description="date_record_entered_onto_ELink") # can be taken from ELink API response - date_metadata_updated: datetime | None = Field(description="date_record_last_updated_on_Elink") + date_metadata_added: datetime | None = Field( + description="date_record_entered_onto_ELink" + ) # can be taken from ELink API response + date_metadata_updated: datetime | None = Field( + description="date_record_last_updated_on_Elink" + ) # status - workflow_status: str # can be taken from ELink API + workflow_status: str # can be taken from ELink API date_released: datetime | None = Field(description="") - date_submitted_to_osti_first: datetime = Field(description="date record was first submitted to OSTI for publication, maintained internally by E-Link") - date_submitted_to_osti_last: datetime = Field(description="most recent date record information was submitted to OSTI. Maintained internally by E-Link") - publication_date: datetime | None = Field(description="") # labelled as publication_date in RecordResponse of ELink API + date_submitted_to_osti_first: datetime = Field( + description="date record was first submitted to OSTI for publication, maintained internally by E-Link" + ) + date_submitted_to_osti_last: datetime = Field( + description="most recent date record information was submitted to OSTI. Maintained internally by E-Link" + ) + publication_date: datetime | None = Field( + description="" + ) # labelled as publication_date in RecordResponse of ELink API + # hypothetically post an update or submit a new record and receive the RecordResponse -def RecordResponse_to_doi_model(recordresponse): - ''' +def record_response_to_DOI_model(recordresponse): + """ turns a recordresponse, which is returned from a save, submission, post, etc. into a doi_model object - ''' + """ params = { "doi": recordresponse.doi, "title": recordresponse.title, "osti_id": str(recordresponse.osti_id), "material_id": recordresponse.site_unique_id, - "date_metadata_added": recordresponse.date_metadata_added, "date_metadata_updated": recordresponse.date_metadata_updated, - "workflow_status": recordresponse.workflow_status, "date_released": recordresponse.date_released, - # date_released_to_osti = recordresponse.released_to_osti_date, # what is the difference between these??? "Date record information was released to OSTI, as entered by releasing official." always seems to be none - "date_submitted_to_osti_first": recordresponse.date_submitted_to_osti_first, # date record was first submitted to OSTI for publication, maintained internally by E-Link - "date_submitted_to_osti_last": recordresponse.date_submitted_to_osti_last, # most recent date record information was submitted to OSTI. Maintained internally by E-Link. - "publication_date": recordresponse.publication_date + "date_submitted_to_osti_first": recordresponse.date_submitted_to_osti_first, # date record was first submitted to OSTI for publication, maintained internally by E-Link + "date_submitted_to_osti_last": recordresponse.date_submitted_to_osti_last, # most recent date record information was submitted to OSTI. Maintained internally by E-Link. + "publication_date": recordresponse.publication_date, } - return doi_model(**params) - -def upload_doi_document_model_to_collection(doi_model, collection): - x = collection.insert_one(doi_model.model_dump()).inserted_id - return x \ No newline at end of file + return DOIModel(**params) diff --git a/src/mp_cite/models.py b/src/mp_cite/models.py deleted file mode 100644 index e4d055f..0000000 --- a/src/mp_cite/models.py +++ /dev/null @@ -1,109 +0,0 @@ -from pydantic import BaseModel, Field, ConfigDict -from typing import List, Dict, Optional -import datetime -from enum import Enum -import bibtexparser -from elinkapi import Elink, Record -from elinkapi.record import RecordResponse, AccessLimitation, JournalType -from elinkapi.geolocation import Geolocation -from elinkapi.identifier import Identifier -from elinkapi.related_identifier import RelatedIdentifier -from elinkapi.person import Person -from elinkapi.organization import Organization - -class TestClass(RecordResponse): - ... - # stuff - -class ELinkGetResponseModel(BaseModel): - osti_id: Optional[int] = Field(...) - dataset_type: str = Field(default="SM") - title: str = Field(...) - persons: List[Person] - contributors: List[Dict[str, str]] = Field( - default=[{"first_name": "Materials", "last_name": "Project"}], - description="List of Dict of first name, last name mapping", - ) # no contributor - publication_date: datetime.date - site_url: str = Field(...) - doi: dict = Field( - {}, title="DOI info", description="Mainly used during GET request" - ) - mp_id: str | None = None - keywords: List[str] = None - - @classmethod - def from_elinkapi_record(cls, R): - gotResponse = ELinkGetResponseModel( - osti_id = R.osti_id, - title = R.title, - persons = R.persons, - # assume default contributors for now, creators vs contributors? - publication_date = R.publication_date, - site_url = R.site_url, - doi = {"doi": R.doi}, - mp_id = next((id.value for id in R.identifiers if id.type == 'RN'), None), - keywords = R.keywords - ) - - return gotResponse - - def get_title(self): - formula = self.keywords[1] - return "Materials Data on %s by Materials Project" % formula - - def get_site_url(self): - return "https://materialsproject.org/materials/%s" % self.mp_id - - def get_keywords(self): - # keywords = "; ".join( - # ["crystal structure", material.pretty_formula, material.chemsys] - # ) - return self.keywords - - @classmethod - def get_default_description(cls): - return ( - "Computed materials data using density " - "functional theory calculations. These calculations determine " - "the electronic structure of bulk materials by solving " - "approximations to the Schrodinger equation. For more " - "information, see https://materialsproject.org/docs/calculations" - ) - - @classmethod - def custom_to_dict(cls, elink_record) -> dict: - if elink_record.osti_id is None or elink_record.osti_id == "": - return elink_record.dict(exclude={"osti_id", "doi"}) - else: - return elink_record.dict(exclude={"doi"}) - - -class ElinkResponseStatusEnum(Enum): - SUCCESS = "SUCCESS" - FAILED = "FAILURE" - - -class ELinkPostResponseModel(BaseModel): - osti_id: str - accession_num: str - product_nos: str - title: str - contract_nos: str - other_identifying_nos: Optional[str] - doi: Dict[str, str] - status: ElinkResponseStatusEnum - status_message: Optional[str] - - def generate_doi_record(self): - doi_collection_record = DOIRecordModel( - material_id=self.accession_num, - doi=self.doi["#text"], - status=self.doi["@status"], - bibtex=None, - valid=True, - last_validated_on=datetime.now(), - ) - doi_collection_record.set_status(status=self.doi["@status"]) - doi_collection_record.last_validated_on = datetime.now() - return doi_collection_record \ No newline at end of file From 1218576c6f16bc95609d41ba04d99e2ac9de66ab Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Wed, 6 Aug 2025 14:03:00 -0700 Subject: [PATCH 22/26] Removed extraneous files for debugging from tests/ -- will let the testing-suite PR handle any pytest testing files, after new-core-to-rebase PR is merged. --- tests/file_to_jsonForUpload.py | 151 --------------------------------- tests/github_bug_report.py | 49 ----------- tests/manage_backfills.py | 49 ----------- tests/outputs.txt | 46 ---------- tests/prod_to_review.py | 120 -------------------------- tests/test_elink_api.py | 109 ------------------------ 6 files changed, 524 deletions(-) delete mode 100644 tests/file_to_jsonForUpload.py delete mode 100644 tests/github_bug_report.py delete mode 100644 tests/manage_backfills.py delete mode 100644 tests/outputs.txt delete mode 100644 tests/prod_to_review.py delete mode 100644 tests/test_elink_api.py diff --git a/tests/file_to_jsonForUpload.py b/tests/file_to_jsonForUpload.py deleted file mode 100644 index aa864e0..0000000 --- a/tests/file_to_jsonForUpload.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -import json -from elinkapi import Elink, Record -from dotenv import load_dotenv - -import requests -from elinkapi.utils import Validation - -from pymongo import MongoClient -import pymongo - -from timeit import default_timer as timer - -load_dotenv() # depends on the root directory from which you run your python scripts. - -review_endpoint = "https://review.osti.gov/elink2api/" - -prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) -review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) - - -atlas_user = os.environ.get("atlas_user") -atlas_password = os.environ.get("atlas_password") -atlas_host = os.environ.get("atlas_host") -mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" - -cwd = os.getcwd() -path = "/json_pages/page_number_4.0" # IT'S ONLY DOING ONE FILE RIGHT NOW -file = open(cwd + path, "r") - -update_counter = 0 -records_checked = 0 - -def delete_record(api, osti_id, reason): - """Delete a record by its OSTI ID.""" - response = requests.delete(f"{api.target}records/{osti_id}?reason={reason}", headers={"Authorization": f"Bearer {api.token}"}) - Validation.handle_response(response) - return response.status_code == 204 # True if deleted successfully - -def emptyReviewAPI(reason): - allDeleted = True - for record in review_api.query_records(): - delete_record(review_api, record.osti_id, reason) - -raise - -start = timer() - -# Post an updated json - -postUnedited = False - -for line in file: - js = json.loads(line.strip()) - - for entry in js["organizations"]: - if entry["type"] == "SPONSOR": - entry["identifiers"] = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - - material_id = js["site_unique_id"] - - robocrys_description = js["description"] - - with MongoClient(mongo_uri) as client: - coll = client["mp_core_blue"]["robocrys"] - res = coll.find_one({"material_id" : material_id}) - records_checked += 1 - - if res != None: - robocrys_description = res["description"] - - # see if an update to the description is necessary, if it is, then update the description and post a new record. - if postUnedited or (robocrys_description != None and js["description"] != robocrys_description): #if a robocrys_description was found internally and it doesn't match what ELink has record... - js["description"] = "OLD WAS UPDATED, THEN IT WAS POSTED: " + robocrys_description - my_record = Record(**js) - - saved_record = None - try: - # The API will now return an error code on this call - # because "AAAA" is not a valid site_ownership_code - - saved_record = review_api.post_new_record(my_record, state="submit") - update_counter += 1 - - print(f"NEW RECORD POSTED: {saved_record.osti_id}") - raise - except: - print(f"Record failed to post!: {my_record.doi}. Robocrys Collection Had Description {robocrys_description[0:50]}... Prod_Env ELink Had {my_record.description[37:87]}...") - raise - - if update_counter >= 10000: - break - -end = timer() -print(f"Records Updated and/or Posted: {update_counter} \nRecords Checked in Total: {records_checked}. \nIt took {end - start} seconds") - -####################################################### -# JUST POST JSON, Then update posted json Later -# post_counter = 0 -# records_checked = 0 - -# for line in file: -# js = json.loads(line.strip()) - -# material_id = js["site_unique_id"] - -# # always post, no update -# my_record = Record(**js) - -# saved_record = None -# try: -# # The API will now return an error code on this call -# # because "AAAA" is not a valid site_ownership_code - -# # posts an unupdated record -# saved_record = review_api.post_new_record(my_record, "save") -# post_counter += 1 - -# print("\n\n NEW RECORD POSTED") -# print(saved_record) - -# robocrys_description = js["description"] - -# with MongoClient(mongo_uri) as client: -# coll = client["mp_core_blue"]["robocrys"] -# res = coll.find_one({"material_id" : material_id}) -# records_checked += 1 - -# if res != None: -# robocrys_description = res["description"] - -# if robocrys_description != None and js["description"] != robocrys_description: # if an update is needed -# # update the js["description"] -# js["description"] = "OLD WAS POSTED, THEN RECORD WITH NEW DESCRIPTION UPDATED IT: " + robocrys_description - -# # turn it into a new record -# new_updated_record = Record(**js) - -# # use that new record to update what was just posted -# review_api.update_record(saved_record.osti_id, new_updated_record, "save") - -# except: -# print("Record failed to post!") - -# if post_counter >= 10000: -# break - -# end = timer() -# print(f"Records Updated and/or Posted: {update_counter} \n Records Checked in Total: {records_checked}. It took {end - start} seconds") - -###################################################### \ No newline at end of file diff --git a/tests/github_bug_report.py b/tests/github_bug_report.py deleted file mode 100644 index 3151e6d..0000000 --- a/tests/github_bug_report.py +++ /dev/null @@ -1,49 +0,0 @@ -from elinkapi import Elink, Organization, Person, exceptions, Record -import os -from dotenv import load_dotenv -from datetime import datetime - -load_dotenv() - -prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) -review_endpoint = "https://review.osti.gov/elink2api/" -review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) - -# record_response = prod_api.get_single_record(1190959) # returns OSTI record response with OSTI ID = 1190959, which has a DOE Contract Number saved (AC02-05CH11231; EDCBEE) -# record_response_dict = record_response.model_dump(exclude_none=True) -# record_response_dict.pop("osti_id") # remove osti_id to allow post function - -# new_record = Record(**record_response_dict) # identical record with removed OSTI_ID -# for org in new_record.organizations: -# if org.type == "SPONSOR": -# print(org) -# org.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - -# # attempt to submit exact same record to review environment -# record_response_after_post = review_api.post_new_record(new_record, "save") # works after re-providing the DOE contract number - -# # next, attempt updating this record -# record_to_update = review_api.get_single_record(record_response_after_post.osti_id) -# record_to_update.title = "Updated Title For Materials Data" -# review_api.update_record(record_response_after_post.osti_id, record_to_update, "submit") - -required_fields = { - "product_type": "DA", - "title": "Testing if CN_DOE can be random", - "organizations": [Organization(type='RESEARCHING', name='LBNL Materials Project (LBNL-MP)'), - Organization(type='SPONSOR', name='TEST SPONSOR ORG', identifiers=[{"type": 'CN_DOE', "value": 'oiajdiwjdiwj'}])], - "persons": [Person(type='AUTHOR', last_name='Schmoe')], - "site_ownership_code": "LBNL-MP", - "access_limitations": ['UNL'], - "publication_date": datetime.now().replace(hour=0, minute=0, second=0, microsecond=0), - "site_url": "https://next-gen.materialsproject.org/materials" -} - -empty_record = Record(**required_fields) -print(f"SUBMITTED TO OSTI, FULLY VALIDATED:\n{review_api.get_single_record(2525614)}\n\n\nTRYING TO SUBMIT:\n{empty_record}") - -try: - saved_record = review_api.post_new_record(empty_record, "submit") -except exceptions.BadRequestException as ve: - print(ve.message) - print(ve.errors) \ No newline at end of file diff --git a/tests/manage_backfills.py b/tests/manage_backfills.py deleted file mode 100644 index a835456..0000000 --- a/tests/manage_backfills.py +++ /dev/null @@ -1,49 +0,0 @@ -# This script will see how many documents in ELink, i.e. ones with a DOI, are not accounted for in the internal DOI collection. - -from elinkapi import Elink, Query, Record - -import os -from dotenv import load_dotenv - -load_dotenv() # depends on the root directory from which you run your python scripts. - -api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) - - -query1 = api.query_records(rows=1000) - -materials_with_dois : list[Record] = [] - -for page in query1: - print(f"Now on Page: {page.title}") - print(f"Material_ID: {page.site_unique_id} and DOI: http://doi.org/{page.doi}") - - if page.site_unique_id.startswith("mp-"): - materials_with_dois.append(page) - - # for record in page.data: - # if record.site_unique_id.startswith("mp-"): - # materials_with_dois.append(record) - - - -# set_q1 = [page for page in query1] -# set_q2 = [page for page in query2] - -# set_diffq1q2 = set(set_q1) - set(set_q2) -# print (f"Difference matched {len(set)} records") - -# filtered = [ -# page for page in query1 -# if page.title.lower().startswith("materials data on") -# ] - -# print (f"Filtered Query1 has {len(filtered)} records") - -# paginate through ALL results -# for page in query1: -# print(page.title) -# print(f"Material_ID: {page.site_unique_id} and DOI: http://doi.org/{page.doi}") - -# for record in page.data: -# print (f"OSTI ID: {record.osti_id} Title: {record.title}") \ No newline at end of file diff --git a/tests/outputs.txt b/tests/outputs.txt deleted file mode 100644 index 8d188e7..0000000 --- a/tests/outputs.txt +++ /dev/null @@ -1,46 +0,0 @@ -(mpcite-env) C:\Users\ongha\OneDrive\Documents\GitHub\MPCite>C:/Users/ongha/anaconda3/envs/mpcite-env/python.exe c:/Users/ongha/OneDrive/Documents/GitHub/MPCite/tests/prod_to_review.py - -Query retrieved 144845 record(s) -Page finished. Now at 500 data entries. 0 edge cases found. -Page finished. Now at 1000 data entries. 0 edge cases found. -Page finished. Now at 1500 data entries. 0 edge cases found. -Page finished. Now at 2000 data entries. 0 edge cases found. -Page finished. Now at 2500 data entries. 0 edge cases found. -Page finished. Now at 3000 data entries. 0 edge cases found. -Page finished. Now at 3500 data entries. 0 edge cases found. -Page finished. Now at 4000 data entries. 0 edge cases found. -Page finished. Now at 4500 data entries. 0 edge cases found. -Page finished. Now at 5000 data entries. 0 edge cases found. -Page finished. Now at 5500 data entries. 0 edge cases found. -Page finished. Now at 6000 data entries. 0 edge cases found. -Page finished. Now at 6500 data entries. 0 edge cases found. -Page finished. Now at 7000 data entries. 0 edge cases found. -Page finished. Now at 7500 data entries. 0 edge cases found. -Page finished. Now at 8000 data entries. 0 edge cases found. -Page finished. Now at 8500 data entries. 0 edge cases found. -Page finished. Now at 9000 data entries. 0 edge cases found. -Page finished. Now at 9500 data entries. 0 edge cases found. -Page finished. Now at 10000 data entries. 0 edge cases found. -Page finished. Now at 10500 data entries. 0 edge cases found. -Page finished. Now at 11000 data entries. 0 edge cases found. -Page finished. Now at 11500 data entries. 0 edge cases found. -Page finished. Now at 12000 data entries. 0 edge cases found. -Page finished. Now at 12500 data entries. 0 edge cases found. -Page finished. Now at 13000 data entries. 0 edge cases found. -Page finished. Now at 13500 data entries. 0 edge cases found. -Page finished. Now at 14000 data entries. 0 edge cases found. -Page finished. Now at 14500 data entries. 0 edge cases found. - -Traceback (most recent call last): - File "C:\Users\ongha\anaconda3\envs\mpcite-env\Lib\site-packages\elinkapi\query.py", line 95, in __next__ - record = self.data.pop() -IndexError: pop from empty list - -During handling of the above exception, another exception occurred: - -Traceback (most recent call last): - File "c:\Users\ongha\OneDrive\Documents\GitHub\MPCite\tests\prod_to_review.py", line 29, in - record = next(query) - File "C:\Users\ongha\anaconda3\envs\mpcite-env\Lib\site-packages\elinkapi\query.py", line 108, in __next__ - raise StopIteration -StopIteration \ No newline at end of file diff --git a/tests/prod_to_review.py b/tests/prod_to_review.py deleted file mode 100644 index 87e311d..0000000 --- a/tests/prod_to_review.py +++ /dev/null @@ -1,120 +0,0 @@ -from elinkapi import Elink, Query, Record - -import os -from dotenv import load_dotenv - -import json - -load_dotenv() # depends on the root directory from which you run your python scripts. - -review_endpoint = "https://review.osti.gov/elink2api/" - -prod_api = Elink(token = os.environ.get("elink_api_PRODUCTION_key")) -review_api = Elink(token = os.environ.get("elink_review_api_token"), target=review_endpoint) - -print(prod_api.query_records()) - -rows_per_page = 100 - -# query production -query = prod_api.query_records(rows=rows_per_page) -print(f"Query retrieved {query.total_rows} record(s)") - -count_materials_data = 0 -count_MaterialsDataOn = 0 -cwd = os.getcwd() -page_number = 0 -page_json_list = [] - -for record in query: - # increment counter - count_materials_data = count_materials_data + 1 - print(f"On record #{count_materials_data}, next url is {query.next_url}, previous url is {query.previous_url}") - - # see if the record is a Materials Data on record - if record.title.startswith("Materials Data on"): - # increment the MaterialsDataOn counter - count_MaterialsDataOn = count_MaterialsDataOn + 1 - - # prepare the new record for the review environment, remove the OSTI ID, and add its model_dump to the list of json objects for the page. - new_record = record - new_record_dict = new_record.model_dump(exclude_none=True) - - new_record_osti_id = new_record_dict.pop("osti_id") # now new_record_dict does not have the osti_id key. - js = json.dumps(new_record_dict, default=str) # datetime objects are not JSON serializable, so we use default=str to convert them to strings. - - page_json_list.append(js) - - # TODO: take the new_record_dict and make it into a new post to the review environment and save the RecordResponse. - - else: - print(f"Found edge case: {record.title}") - - if count_materials_data % rows_per_page == 0: - # create/open, write, and close new json file - page_number = count_materials_data / rows_per_page - path = f'/json_pages/page_number_{page_number}' - fp = open(cwd+path, 'a') - - for js in page_json_list: - fp.write(js) - fp.write("\n") - - fp.close() - page_json_list = [] - - print(f"Page {page_number} finished. Now at {count_materials_data} data entries. {count_materials_data - count_MaterialsDataOn} edge cases found.") - -# print remainder of records if not a full page after for loop exits -page_number = page_number + 1 -path = f'/json_pages/page_number_{page_number}' -fp = open(cwd+path, 'a') -for js in page_json_list: - fp.write(js) - fp.write("\n") -fp.close() - -# # if contains materials data on, then add to batch -# for count_materials_data < query.total_rows: - -# # print(f"The length of the query is now {len(query.data)}") -# record = next(query) -# count_materials_data = count_materials_data + 1 - -# if record.title.startswith("Materials Data on"): -# count_MaterialsDataOn = count_MaterialsDataOn + 1 - -# new_record = record -# new_record_dict = new_record.model_dump(exclude_none=True) - -# new_record_osti_id = new_record_dict.pop("osti_id") - -# page_dict[f"Entry OSTI_ID {new_record_osti_id}"] = new_record_dict - -# # TODO: take the new_record_dict and make it into a new post to the review environment and save the RecordResponse. - - - -# if count_materials_data % rows_per_page == 0: -# # if a page has been fully consummed, then print the new batched dictionary to a json file. - -# js = json.dumps(page_dict, default=str) - -# # open new json file if not exist it will create -# cwd = os.getcwd() -# path = f'/json_pages/page_number_{count_materials_data/rows_per_page}' -# fp = open(cwd+path, 'a') - -# # write to json file -# fp.write(js) - -# # close the connection to the file and empty the dict -# fp.close() -# page_dict = {} - -# print(f"Page {(count_materials_data / rows_per_page)} finished. Now at {count_materials_data} data entries. {count_materials_data - count_MaterialsDataOn} edge cases found.") - -# model_dump exclude_none=True, remove null keys -# pop osti_id --> save batch to json files -# make new record -# post to review_api diff --git a/tests/test_elink_api.py b/tests/test_elink_api.py deleted file mode 100644 index 80afba7..0000000 --- a/tests/test_elink_api.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -from dotenv import load_dotenv - -from elinkapi import Elink, Record, exceptions -import pytest -from mpcite.models import ELinkGetResponseModel, TestClass - -from pymongo import MongoClient -import pymongo - -load_dotenv() - -atlas_user = os.environ.get("atlas_user") -atlas_password = os.environ.get("atlas_password") -atlas_host = os.environ.get("atlas_host") -mongo_uri = f"mongodb+srv://{atlas_user}:{atlas_password}@{atlas_host}/" - -api = Elink(token=os.environ.get("elink_api_PRODUCTION_key")) # target default is production E-link service. - -### Grabbing an existing record - -# record = api.get_single_record(mp-id) # test for silicon - -# type(record) - -# ELinkGotRecordModel = ELinkGetResponseModel.from_elinkapi_record(record) - -# print(ELinkGotRecordModel.get_title()) -# print(ELinkGotRecordModel.get_site_url()) -# print(ELinkGotRecordModel.get_keywords()) -# print(ELinkGotRecordModel.get_default_description()) - -# ELinkTestGetRecordModel = TestClass(**record.model_dump()) - -### Making a new record - -# with MongoClient(mongo_uri) as client: -# #get all material_ids and dois from doi collection -# doi_collection = client["mp_core"]["dois"] -# materials_to_update = list(doi_collection.find({}, {"_id": 0, "material_id": 1, "doi": 1}, limit=10)) -# material_ids = [entry["material_id"] for entry in materials_to_update] - -# # check # of material_ids from DOI collection vs amount in robocrys - -# # get description for material_ids from robocrys collection -# coll = client["mp_core_blue"]["robocrys"] -# res = list(coll.find({"material_id": {"$in": material_ids}}, {"_id": 0, "material_id": 1, "description": 1})) - -# # join on material_id -# for doc in res: -# mat = next(filter(lambda x: x["material_id"] == doc["material_id"], materials_to_update)) -# doc["doi"] = mat["doi"] - - -# {"material_id": ..., "doi": ..., "description": ...} -> -# Record( -# template_fields ..., -# doi: ..., -# description: ..., -# fields_where_material_id_makes_sense: ..., -# ) - -# with the client open -with MongoClient(mongo_uri) as client: - # get all dois from the collection - doi_collection = client["mp_core"]["dois"] - materials_to_update = list(doi_collection.find({}, {"_id": 0, "doi": 1, "material_id": 1}, limit=2)) - - # from the doi collection, grab the material_id and doi of each material - material_ids = [entry["material_id"] for entry in materials_to_update] - - # additionally, gain the osti id from the doi - osti_ids = [entry["doi"].split("10.17188/")[1] for entry in materials_to_update] - - # additionally, grab the description of each material from the robocrys - coll = client["mp_core_blue"]["robocrys"] # grabs robocrys collection from active database - res = list(coll.find({"material_id": {"$in": material_ids}}, {"_id": 0, "material_id": 1, "description": 1})) # grabs the material id and description of entries in the collection - descriptions = [entry["description"] for entry in res] - - # for each material (and its material_id, doi, and osti_id) - for i in range(len(materials_to_update)): - internal_material_id = material_ids[i] - internal_osti_id = osti_ids[i] - internal_description = descriptions[i] - - # get_single_record(osti_id) - record = api.get_single_record(internal_osti_id) - - print(f"\n \n \nPrinting what is currently on ELINK for {internal_material_id}*****************************************") - print(record) - - if internal_material_id == record.site_unique_id: - # update description - record.description = "testTESTtestTESTtest" - - print(f"\n \n \nPrinting record for {internal_material_id}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") - print(record) - - # # post updated record - # try: - # saved_record = api.post_new_record(record, "save") - # except exceptions.BadRequestException as ve: - # ... - # # ve.message = "Site Code AAAA is not valid." - # # ve.errors provides more details: - # # [{"status":"400", "detail":"Site Code AAAA is not valid.", "source":{"pointer":"site_ownership_code"}}] - - - From 8191ffce6dbcfc9d1562aac1704683400dd71a5c Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Wed, 6 Aug 2025 17:10:25 -0700 Subject: [PATCH 23/26] improved DOIModel to accept RecordResponse splat, and cleaned up comments on core.py --- src/mp_cite/core.py | 27 +++++++++++++-------------- src/mp_cite/doi_builder.py | 36 +++++++++++++++--------------------- 2 files changed, 28 insertions(+), 35 deletions(-) diff --git a/src/mp_cite/core.py b/src/mp_cite/core.py index 5e9c5bc..9fe7938 100644 --- a/src/mp_cite/core.py +++ b/src/mp_cite/core.py @@ -78,6 +78,19 @@ def update_existing_osti_record( :elinkapi is the instance of the elinkapi associated with the environment in which the record is held (e.g. either production or review environment) :osti_id is the osti id of the record which ought to be updated :new_values is a dictionary of keywords (which should exist in ELink's record model) and new value pairs. + + N.B., it is currently assumed that the user will handle the "sponsor identifier bug" + --- in which the retreived record responses of validated records from the E-Link production environment seemingly + lack the required Sponsor Organization identifiers which were necessary for their submission (due to rearrangement of metadata + on E-Link's side) --- before calling this function. + + Otherwise, the following code excerpt would need to be added to retroactively fix the issue with the sponsor organization's identifiers + for entry in record.organizations: + if entry.type == "SPONSOR": + entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] + break + + Instead, we leave this for the user. """ record_on_elink = elinkapi.get_single_record(osti_id) @@ -85,13 +98,6 @@ def update_existing_osti_record( for keyword in new_values: setattr(record_on_elink, keyword, new_values[keyword]) - # assume the use with fix the sponsor identifier bug before calling the update function - # # fix the issue with the sponsor organization's identifiers - # for entry in record_on_elink.organizations: - # if entry.type == "SPONSOR": - # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - # break - return elinkapi.update_record( osti_id, record_on_elink, state="save" ) # user should use update_state_of_osti_record to submit instead @@ -136,13 +142,6 @@ def update_state_of_osti_record( returns the record response after updating the state. """ - # assuming that the user will handle the sponsor identifier bug before calling this function - # # fix the issue with the sponsor organization's identifiers - # for entry in record.organizations: - # if entry.type == "SPONSOR": - # entry.identifiers = [{"type": 'CN_DOE', "value": 'AC02-05CH11231'}] - # break - return elinkapi.update_record(osti_id, record, new_state) diff --git a/src/mp_cite/doi_builder.py b/src/mp_cite/doi_builder.py index 04bf257..57601fb 100644 --- a/src/mp_cite/doi_builder.py +++ b/src/mp_cite/doi_builder.py @@ -1,4 +1,5 @@ -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator +from typing import Any from datetime import datetime @@ -10,11 +11,12 @@ class DOIModel(BaseModel): title: str = Field( description="The title of the record" ) # can be taken from ELink API - osti_id: str = Field( + osti_id: int = Field( coerce_numbers_to_str=True, description="The OSTI ID number allocated by OSTI to make the DOI number", ) # can be taken from ELink API material_id: str # can be taken from Robocrys Collection or ELink API + site_unique_id: str # added for compatability with E-Link's record model but in practice will match material_id # time stamps date_metadata_added: datetime | None = Field( @@ -37,24 +39,16 @@ class DOIModel(BaseModel): description="" ) # labelled as publication_date in RecordResponse of ELink API + @model_validator(mode="before") + def set_material_id(cls, values: dict[str, Any]): + """ + set_material_id will take the values passed into the model constructor before full instantiation of the object and pydantic parcing + and make it that the whatever is passed in for the unique_site_id will match whatever is passed in for material_id -# hypothetically post an update or submit a new record and receive the RecordResponse -def record_response_to_DOI_model(recordresponse): - """ - turns a recordresponse, which is returned from a save, submission, post, etc. into a doi_model object - """ - params = { - "doi": recordresponse.doi, - "title": recordresponse.title, - "osti_id": str(recordresponse.osti_id), - "material_id": recordresponse.site_unique_id, - "date_metadata_added": recordresponse.date_metadata_added, - "date_metadata_updated": recordresponse.date_metadata_updated, - "workflow_status": recordresponse.workflow_status, - "date_released": recordresponse.date_released, - "date_submitted_to_osti_first": recordresponse.date_submitted_to_osti_first, # date record was first submitted to OSTI for publication, maintained internally by E-Link - "date_submitted_to_osti_last": recordresponse.date_submitted_to_osti_last, # most recent date record information was submitted to OSTI. Maintained internally by E-Link. - "publication_date": recordresponse.publication_date, - } + :cls to designate it as a class method + :values are the values passed into the constructor (contain the "raw input") - return DOIModel(**params) + returns the values so that instantiation can proceed. + """ + values["material_id"] = values["site_unique_id"] + return values From 99d1ca97226775bbe8e927fea30bdf371ff5a15b Mon Sep 17 00:00:00 2001 From: HugoOnghai Date: Wed, 6 Aug 2025 17:24:45 -0700 Subject: [PATCH 24/26] Addressed more comments, removed oldreqs, renamed doi_builder, and moved DAMinimum model to models.py --- oldrequirements.txt | 8 ---- src/mp_cite/core.py | 38 ++++------------- src/mp_cite/doi_builder.py | 54 ------------------------ src/mp_cite/models.py | 86 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+), 93 deletions(-) delete mode 100644 oldrequirements.txt delete mode 100644 src/mp_cite/doi_builder.py create mode 100644 src/mp_cite/models.py diff --git a/oldrequirements.txt b/oldrequirements.txt deleted file mode 100644 index 495f0a2..0000000 --- a/oldrequirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -xmltodict==0.12.0 -urllib3==1.25.9 -dicttoxml==1.7.4 -tqdm==4.46.0 -bibtexparser==1.2.0 -monty==3.0.2 -nbformat==5.0.7 -nbconvert==5.6.1 diff --git a/src/mp_cite/core.py b/src/mp_cite/core.py index 9fe7938..4b5c28b 100644 --- a/src/mp_cite/core.py +++ b/src/mp_cite/core.py @@ -1,17 +1,16 @@ from typing import TypeAlias from elinkapi import Elink -from elinkapi.record import RecordResponse, Record, Organization, Person +from elinkapi.record import RecordResponse from pymongo import MongoClient import requests from elinkapi.utils import Validation -from datetime import datetime -import pytz -from pydantic import Field -from typing import List, Literal +from mp_cite.doi_builder import MinimumDARecord + +from typing import Literal OstiID: TypeAlias = int @@ -106,14 +105,15 @@ def update_existing_osti_record( def submit_new_osti_record( elinkapi: Elink, new_values: dict, - state="submit", # assuming there is no need to both with saving. just send new record to osti when its ready for submission. also assume bug with DOE contract number identifier in sponsor organization is accounted for + state="submit", ) -> RecordResponse: """ submit_new_osti_record generates a new record based on the provided keyword-value pairs in the new_values dict and the default minimum DA Record metadata necessary for submission :elinkapi is the elinkapi (see previous) :new_values is the dictionary of keywords and values which want to be included in the submitted record (besides or in lieu of default values). The title MUST be provided. - :state defaults to "submit" but the user can simply "save" if desired. + :state defaults to "submit" but the user can simply "save" if desired. This is done given our assumption that there is + no need to both with saving, rather, just only send new record to osti when it's ready for submission. returns the record response after submission """ @@ -161,27 +161,3 @@ def delete_osti_record(elinkapi: Elink, osti_id: OstiID, reason: str) -> bool: ) Validation.handle_response(response) return response.status_code == 204 # True if deleted successfully - - -class MinimumDARecord(Record): - product_type: str = Field(default="DA") - title: str # Required - organizations: List[Organization] = Field( - default_factory=lambda: [ - Organization(type="RESEARCHING", name="LBNL Materials Project (LBNL-MP)"), - Organization( - type="SPONSOR", - name="TEST SPONSOR ORG", - identifiers=[{"type": "CN_DOE", "value": "AC02-05CH11231"}], - ), # sponsor org is necessary for submission - ] - ) - persons: List[Person] = Field( - default_factory=lambda: [Person(type="AUTHOR", last_name="Persson")] - ) - site_ownership_code: str = Field(default="LBNL-MP") - access_limitations: List[str] = Field(default_factory=lambda: ["UNL"]) - publication_date: datetime = Field( - default_factory=lambda: datetime.now(tz=pytz.UTC) - ) - site_url: str = Field(default="https://next-gen.materialsproject.org/materials") diff --git a/src/mp_cite/doi_builder.py b/src/mp_cite/doi_builder.py deleted file mode 100644 index 57601fb..0000000 --- a/src/mp_cite/doi_builder.py +++ /dev/null @@ -1,54 +0,0 @@ -from pydantic import BaseModel, Field, model_validator -from typing import Any -from datetime import datetime - - -class DOIModel(BaseModel): - # identifiers - doi: str = Field( - description="The DOI number as allocated by OSTI" - ) # can be taken from ELink API - title: str = Field( - description="The title of the record" - ) # can be taken from ELink API - osti_id: int = Field( - coerce_numbers_to_str=True, - description="The OSTI ID number allocated by OSTI to make the DOI number", - ) # can be taken from ELink API - material_id: str # can be taken from Robocrys Collection or ELink API - site_unique_id: str # added for compatability with E-Link's record model but in practice will match material_id - - # time stamps - date_metadata_added: datetime | None = Field( - description="date_record_entered_onto_ELink" - ) # can be taken from ELink API response - date_metadata_updated: datetime | None = Field( - description="date_record_last_updated_on_Elink" - ) - - # status - workflow_status: str # can be taken from ELink API - date_released: datetime | None = Field(description="") - date_submitted_to_osti_first: datetime = Field( - description="date record was first submitted to OSTI for publication, maintained internally by E-Link" - ) - date_submitted_to_osti_last: datetime = Field( - description="most recent date record information was submitted to OSTI. Maintained internally by E-Link" - ) - publication_date: datetime | None = Field( - description="" - ) # labelled as publication_date in RecordResponse of ELink API - - @model_validator(mode="before") - def set_material_id(cls, values: dict[str, Any]): - """ - set_material_id will take the values passed into the model constructor before full instantiation of the object and pydantic parcing - and make it that the whatever is passed in for the unique_site_id will match whatever is passed in for material_id - - :cls to designate it as a class method - :values are the values passed into the constructor (contain the "raw input") - - returns the values so that instantiation can proceed. - """ - values["material_id"] = values["site_unique_id"] - return values diff --git a/src/mp_cite/models.py b/src/mp_cite/models.py new file mode 100644 index 0000000..e649ff2 --- /dev/null +++ b/src/mp_cite/models.py @@ -0,0 +1,86 @@ +from pydantic import BaseModel, Field, model_validator + +from datetime import datetime +from elinkapi import Record, Organization, Person + +from typing import List, Any +import pytz + + +class DOIModel(BaseModel): + """ + The model for a DOI document in a mongodb collection, which should better align with E-Link's record model. + + It is designed for easy transfer from E-Link record response to doi document. All fields can be mapped directly from a + record response keywords of the same name, or, in the case of material_id, it is automatically filled in with site_unique_id + with the model validator `set_material_id(...)` + """ + + # identifiers + doi: str = Field(description="The DOI number as allocated by OSTI") + title: str = Field(description="The title of the record") + osti_id: int = Field( + coerce_numbers_to_str=True, + description="The OSTI ID number allocated by OSTI to make the DOI number", + ) + material_id: str + site_unique_id: str + + # time stamps + date_metadata_added: datetime | None = Field( + description="date_record_entered_onto_ELink" + ) + date_metadata_updated: datetime | None = Field( + description="date_record_last_updated_on_Elink" + ) + + # status + workflow_status: str + date_released: datetime | None = Field(description="") + date_submitted_to_osti_first: datetime = Field( + description="date record was first submitted to OSTI for publication, maintained internally by E-Link" + ) + date_submitted_to_osti_last: datetime = Field( + description="most recent date record information was submitted to OSTI. Maintained internally by E-Link" + ) + publication_date: datetime | None = Field(description="") + + @model_validator(mode="before") + def set_material_id(cls, values: dict[str, Any]): + """ + set_material_id will take the values passed into the model constructor before full instantiation of the object and pydantic parcing + and make it that the whatever is passed in for the unique_site_id will match whatever is passed in for material_id + + :cls to designate it as a class method + :values are the values passed into the constructor (contain the "raw input") + + returns the values so that instantiation can proceed. + """ + values["material_id"] = values["site_unique_id"] + return values + + +class MinimumDARecord(Record): + """ """ + + product_type: str = Field(default="DA") + title: str # Required + organizations: List[Organization] = Field( + default_factory=lambda: [ + Organization(type="RESEARCHING", name="LBNL Materials Project (LBNL-MP)"), + Organization( + type="SPONSOR", + name="TEST SPONSOR ORG", + identifiers=[{"type": "CN_DOE", "value": "AC02-05CH11231"}], + ), # sponsor org is necessary for submission + ] + ) + persons: List[Person] = Field( + default_factory=lambda: [Person(type="AUTHOR", last_name="Persson")] + ) + site_ownership_code: str = Field(default="LBNL-MP") + access_limitations: List[str] = Field(default_factory=lambda: ["UNL"]) + publication_date: datetime = Field( + default_factory=lambda: datetime.now(tz=pytz.UTC) + ) + site_url: str = Field(default="https://next-gen.materialsproject.org/materials") From 3ef140558d4d0fa5fdc31cb79497bb574e41ace2 Mon Sep 17 00:00:00 2001 From: Tyler Mathis <35553152+tsmathis@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:47:32 -0700 Subject: [PATCH 25/26] move docstring to beginning of function --- src/mp_cite/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mp_cite/core.py b/src/mp_cite/core.py index 4b5c28b..0e4d698 100644 --- a/src/mp_cite/core.py +++ b/src/mp_cite/core.py @@ -131,7 +131,6 @@ def submit_new_osti_record( def update_state_of_osti_record( elinkapi: Elink, osti_id: OstiID, new_state: Literal["save", "submit"] ) -> RecordResponse: - record = elinkapi.get_single_record(osti_id) """ update_state_of_osti_record allows a user to update the state of a record with provided osti_id to either "save" or "submit" (the two valid states) @@ -141,7 +140,7 @@ def update_state_of_osti_record( returns the record response after updating the state. """ - + record = elinkapi.get_single_record(osti_id) return elinkapi.update_record(osti_id, record, new_state) From 5f21e4610830aa149c1a0e4ce2f70da38f933420 Mon Sep 17 00:00:00 2001 From: Tyler Mathis <35553152+tsmathis@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:48:11 -0700 Subject: [PATCH 26/26] remove empty docstring --- src/mp_cite/models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mp_cite/models.py b/src/mp_cite/models.py index e649ff2..8bbaf74 100644 --- a/src/mp_cite/models.py +++ b/src/mp_cite/models.py @@ -61,8 +61,6 @@ def set_material_id(cls, values: dict[str, Any]): class MinimumDARecord(Record): - """ """ - product_type: str = Field(default="DA") title: str # Required organizations: List[Organization] = Field(