From 4e14c375a6b3072be719729c32af52f9e44f23b8 Mon Sep 17 00:00:00 2001 From: Andrew Crozier Date: Wed, 22 Jan 2020 13:03:46 +0000 Subject: [PATCH 1/3] Add module to make datasets IO easier with pandas --- faculty/datasets/pandas.py | 77 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 faculty/datasets/pandas.py diff --git a/faculty/datasets/pandas.py b/faculty/datasets/pandas.py new file mode 100644 index 00000000..4dcd527a --- /dev/null +++ b/faculty/datasets/pandas.py @@ -0,0 +1,77 @@ +# Copyright 2018-2019 Faculty Science Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas + +from faculty.session import get_session +from faculty.context import get_context +from faculty.clients.object import ObjectClient +from faculty.datasets import transfer + + +def read_csv(project_path, project_id=None, *args, **kwargs): + """Read a CSV file from a project's datasets as a pandas DataFrame. + + Requires pandas to be installed. + + Parameters + ---------- + project_path : str + The path of the CSV file in the project's datasets to read. + project_id : str, optional + The project to get the CSV file from. You need to have access to this + project for it to work. Defaults to the project set by + FACULTY_PROJECT_ID in your environment. + *args + Additional positional arguments to pass to ``pandas.read_csv``. + **kwargs + Additional keyword arguments to pass to ``pandas.read_csv``. + """ + + project_id = project_id or get_context().project_id + object_client = ObjectClient(get_session()) + presigned_url = object_client.presign_download(project_id, project_path) + + return pandas.read_csv(presigned_url, *args, **kwargs) + + +def to_csv(dataframe, project_path, project_id=None, *args, **kwargs): + """Write a pandas DataFrame to a CSV file in a project's datasets. + + Requires pandas to be installed. + + Parameters + ---------- + dataframe : pandas.DataFrame + The DataFrame to write. + project_path : str + The path in the project's datasets to write to. + project_id : str, optional + The project to write the CSV file to. You need to have access to this + project for it to work. Defaults to the project set by + FACULTY_PROJECT_ID in your environment. + *args + Additional positional arguments to pass to ``pandas.DataFrame.to_csv``. + **kwargs + Additional keyword arguments to pass to ``pandas.DataFrame.to_csv``. + """ + + project_id = project_id or get_context().project_id + object_client = ObjectClient(get_session()) + + content = dataframe.to_csv(path_or_buf=None, *args, **kwargs) + + transfer.upload( + object_client, project_id, project_path, content.encode("utf-8") + ) From 0691b3f8429f76d452cfa55731df5eca11416252 Mon Sep 17 00:00:00 2001 From: Andrew Crozier Date: Wed, 23 Jun 2021 10:32:17 +0100 Subject: [PATCH 2/3] Update copyright notice --- faculty/datasets/pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faculty/datasets/pandas.py b/faculty/datasets/pandas.py index 4dcd527a..2c88c48a 100644 --- a/faculty/datasets/pandas.py +++ b/faculty/datasets/pandas.py @@ -1,4 +1,4 @@ -# Copyright 2018-2019 Faculty Science Limited +# Copyright 2018-2021 Faculty Science Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 0ef85aaafa9ff80a1a416e46494356d090c1e453 Mon Sep 17 00:00:00 2001 From: Andrew Crozier Date: Wed, 23 Jun 2021 12:39:19 +0100 Subject: [PATCH 3/3] Update to latest client APIs --- faculty/datasets/pandas.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/faculty/datasets/pandas.py b/faculty/datasets/pandas.py index 2c88c48a..07a4f6eb 100644 --- a/faculty/datasets/pandas.py +++ b/faculty/datasets/pandas.py @@ -40,7 +40,11 @@ def read_csv(project_path, project_id=None, *args, **kwargs): """ project_id = project_id or get_context().project_id - object_client = ObjectClient(get_session()) + + session = get_session() + url = session.service_url(ObjectClient.SERVICE_NAME) + object_client = ObjectClient(url, session) + presigned_url = object_client.presign_download(project_id, project_path) return pandas.read_csv(presigned_url, *args, **kwargs) @@ -68,7 +72,10 @@ def to_csv(dataframe, project_path, project_id=None, *args, **kwargs): """ project_id = project_id or get_context().project_id - object_client = ObjectClient(get_session()) + + session = get_session() + url = session.service_url(ObjectClient.SERVICE_NAME) + object_client = ObjectClient(url, session) content = dataframe.to_csv(path_or_buf=None, *args, **kwargs)