diff --git a/dx-toolkit/dx_extract_dataset_python.ipynb b/dx-toolkit/dx_extract_dataset_python.ipynb index 158ebf6..ac65db9 100644 --- a/dx-toolkit/dx_extract_dataset_python.ipynb +++ b/dx-toolkit/dx_extract_dataset_python.ipynb @@ -1 +1 @@ -{"metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5"}}, "nbformat_minor": 5, "nbformat": 4, "cells": [{"cell_type": "markdown", "source": "# \u201cdx extract_dataset\u201d in Python\n
\n***As-Is Software Disclaimer***\n\nThis content in this repository is delivered \u201cAs-Is\u201d. Notwithstanding anything to the contrary, DNAnexus will have no warranty, support, liability or other obligations with respect to Materials provided hereunder.\n\n
\n\nThis notebook demonstrates usage of the dx command `extract_dataset` for:\n* Retrieval of Apollo-stored data, as referenced within entities and fields of a Dataset or Cohort object on the platform\n* Retrieval of the underlying data dictionary files used to generate a Dataset object on the platform\n\nMIT License applies to this notebook.", "metadata": {}}, {"cell_type": "markdown", "source": "## Preparing your environment\n### Launch spec:\n\n* App name: JupyterLab with Python, R, Stata, ML ()\n* Kernel: Python\n* Instance type: mem1_ssd1_v2_x2\n* Cost: < $0.2\n* Runtime: =~ 10 min\n* Data description: Input for this notebook is a v3.0 Dataset or Cohort object ID", "metadata": {}}, {"cell_type": "markdown", "source": "### dxpy version\nextract_dataset requires dxpy version >= 0.329.0. If running the command from your local environment (i.e. off of the DNAnexus platform), it may be required to also install pandas. For example, pip3 install -U dxpy[pandas]", "metadata": {}}, {"cell_type": "code", "source": "import subprocess\nimport dxpy\nimport pandas as pd\nimport os\nimport glob\npd.set_option('display.max_columns', None)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "dxpy.__version__", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 1. Assign environment variables", "metadata": {}}, {"cell_type": "code", "source": "# The referenced Dataset is private and provided only to demonstrate an example input. The user will need to supply a permissible and valid record-id\n# Assign project-id of dataset\npid = 'project-G5BzYk80kP5bvbXy5J7PQZ36'\n# Assign dataset record-id\nrid = 'record-GJ3Y7jQ0VKyy592yPxB4yG7Y'\n# Assign joint dataset project-id:record-id\ndataset = (':').join([pid, rid])", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 2. Call \u201cdx extract_dataset\u201d using a supplied dataset", "metadata": {}}, {"cell_type": "code", "source": "cmd = [\"dx\", \"extract_dataset\", dataset, \"-ddd\", \"--delimiter\", \",\"]\nsubprocess.check_call(cmd)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "#### Preview data in the three dictionary (*.csv) files", "metadata": {}}, {"cell_type": "code", "source": "path = os.getcwd()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "data_dict_csv = glob.glob(os.path.join(path, \"*.data_dictionary.csv\"))[0]\ndata_dict_df = pd.read_csv(data_dict_csv)\ndata_dict_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "codings_csv = glob.glob(os.path.join(path, \"*.codings.csv\"))[0]\ncodings_df = pd.read_csv(codings_csv)\ncodings_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "entity_dict_csv = glob.glob(os.path.join(path, \"*.entity_dictionary.csv\"))[0]\nentity_dict_df = pd.read_csv(entity_dict_csv)\nentity_dict_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 3. Parse returned metadata and extract entity/field names", "metadata": {}}, {"cell_type": "code", "source": "data_dict_df['ent_field'] = data_dict_df['entity'].astype(str) + \\\n '.' 
+ data_dict_df['name'].astype(str)\n \nentity_field = data_dict_df.ent_field.values.tolist()\nentity_field[:10]", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 4. Use extracted entity and field names as input to the called function, \u201cdx extract_dataset\u201d and extract data", "metadata": {}}, {"cell_type": "code", "source": "cmd = [\"dx\", \"extract_dataset\", dataset, \"--fields\", ','.join(entity_field), \n \"-o\", \"extracted_data.csv\"]\nsubprocess.check_call(cmd)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "#### Print data in the retrieved data file", "metadata": {}}, {"cell_type": "code", "source": "fields_file_df = pd.read_csv(\"extracted_data.csv\", float_precision='round_trip')\nfields_file_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 5. Replace any coded column values of extracted data with the coded meaning", "metadata": {}}, {"cell_type": "code", "source": "data_dict_df['coding_value_type'] = data_dict_df['is_multi_select'].apply(\n lambda x: \"list\" if x == \"yes\" else \"string\")", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_file_df_decoded = fields_file_df.copy(deep=True)\n\ndef get_meaning(code_name, code):\n if isinstance(code, int):\n code = str(code)\n elif isinstance(code, float):\n code = str(code)\n # If field type is float, and an integer sparse code is used for the field \n # (example `1`), the retrieved data represents it as a float (`1.0`).\n # Strip the `.0` suffix and search for the code in codings dataframe\n if codings_df.loc[(codings_df[\"coding_name\"]== code_name) & \n (codings_df[\"code\"]== code), \"meaning\"].empty:\n if code.endswith('.0'):\n code = code[:-2]\n return(codings_df.loc[(codings_df[\"coding_name\"]== code_name) & \n (codings_df[\"code\"]== code), \"meaning\"])\n\nfor (columnName, columnData) in fields_file_df_decoded.iteritems():\n code_name, data_type= data_dict_df[(data_dict_df[\"ent_field\"]==columnName)][\n [\"coding_name\", \"coding_value_type\"]].values[0]\n if not pd.isna(code_name):\n set_of_values = set(columnData.dropna())\n for val in set_of_values:\n if data_type == \"list\":\n new_val = []\n list_val = eval(val)\n for i in list_val:\n meaning = get_meaning(code_name, i)\n if not meaning.empty:\n new_val.append(meaning.values.item())\n else:\n new_val.append(i)\n fields_file_df_decoded.loc[fields_file_df_decoded[columnName] == val, \n columnName] = str(new_val)\n continue\n elif data_type == \"string\":\n meaning = get_meaning(code_name, val)\n if not meaning.empty:\n fields_file_df_decoded.loc[fields_file_df_decoded[columnName] == val, \n columnName] = meaning.values.item()\nfields_file_df_decoded.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_file_df_decoded.to_csv(\"extracted_data_with_code_meanings.csv\", index=False)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 6. 
Drop sparsely coded values", "metadata": {}}, {"cell_type": "code", "source": "fields_sparse_code = fields_file_df.copy(deep=True)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "for (columnName, columnData) in fields_sparse_code.iteritems():\n code_name, data_type, is_sparse_coding= data_dict_df[\n (data_dict_df[\"ent_field\"]==columnName)][\n [\"coding_name\", \"coding_value_type\", \"is_sparse_coding\"]].values[0]\n if not (pd.isna(code_name) and pd.isna(is_sparse_coding)) and \\\n is_sparse_coding=='yes':\n set_of_values = set(columnData.dropna())\n for val in set_of_values:\n if data_type == \"list\":\n new_val = []\n list_val = eval(val)\n for i in list_val:\n meaning = get_meaning(code_name, i)\n if meaning.empty:\n new_val.append(i)\n fields_sparse_code.loc[fields_sparse_code[columnName] == val, \n columnName] = str(new_val)\n continue\n elif data_type == \"string\":\n meaning = get_meaning(code_name, val)\n if not meaning.empty:\n fields_sparse_code.loc[fields_sparse_code[columnName] == val, \n columnName] = None\nfields_sparse_code.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_sparse_code.to_csv(\"extracted_data_with_sparse_code_drop.csv\", index=False)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 7. Replace the column titles (field names) of extracted data with the field titles", "metadata": {}}, {"cell_type": "code", "source": "current_columns = list(fields_file_df.columns)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "new_columns = {}\ntitles = []\nduplicate_titles = []\nfor val in current_columns:\n meaning = data_dict_df.loc[data_dict_df[\"ent_field\"]==val, \n \"title\"].values.item()\n if meaning not in titles:\n titles.append(meaning)\n elif meaning not in duplicate_titles:\n duplicate_titles.append(meaning)\nfor val in current_columns:\n meaning = data_dict_df.loc[data_dict_df[\"ent_field\"]==val, \n \"title\"].values.item()\n if meaning not in duplicate_titles:\n new_columns[val] = meaning\n else:\n new_columns[val] = val.replace(\".\", \"-\")", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_file_df.rename(columns = new_columns, inplace = True)\nfields_file_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_file_df.to_csv(\"extracted_data_with_updated_titles.csv\", index=False)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 8. Upload extracted dictionaries and data back to the project", "metadata": {}}, {"cell_type": "code", "source": "cmd = \"dx upload *.csv\"\nsubprocess.check_call(cmd, shell=True)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}]} \ No newline at end of file +{"metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5"}}, "nbformat_minor": 5, "nbformat": 4, "cells": [{"cell_type": "markdown", "source": "# \u201cdx extract_dataset\u201d in Python\n
\n***As-Is Software Disclaimer***\n\nThe content in this repository is delivered \u201cAs-Is\u201d. Notwithstanding anything to the contrary, DNAnexus will have no warranty, support, liability or other obligations with respect to Materials provided hereunder.\n\n---
\n\nThis notebook demonstrates usage of the dx command `extract_dataset` for:\n* Retrieval of Apollo-stored data, as referenced within entities and fields of a Dataset or Cohort object on the platform\n* Retrieval of the underlying data dictionary files used to generate a Dataset object on the platform\n\nMIT License applies to this notebook.", "metadata": {}}, {"cell_type": "markdown", "source": "## Preparing your environment\n### Launch spec:\n\n* App name: JupyterLab with Python, R, Stata, ML\n* Kernel: Python\n* Instance type: mem1_ssd1_v2_x2\n* Cost: < $0.20\n* Runtime: ~10 min\n* Data description: Input for this notebook is a v3.0 Dataset or Cohort object ID", "metadata": {}}, {"cell_type": "markdown", "source": "### dxpy version\n`extract_dataset` requires dxpy version >= 0.329.0. If running the command from your local environment (i.e., off the DNAnexus platform), you may also need to install pandas, for example: `pip3 install -U dxpy[pandas]`", "metadata": {}}, {"cell_type": "code", "source": "import ast\nimport subprocess\nimport dxpy\nimport pandas as pd\nimport os\nimport glob\npd.set_option('display.max_columns', None)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "dxpy.__version__", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 1. Assign variables", "metadata": {}}, {"cell_type": "code", "source": "# The referenced Dataset is private and provided only to demonstrate an example input. The user will need to supply a permissible and valid record-id\n# Assign project-id of dataset\npid = 'project-G5BzYk80kP5bvbXy5J7PQZ36'\n# Assign dataset record-id\nrid = 'record-GJ3Y7jQ0VKyy592yPxB4yG7Y'\n# Assign joint dataset project-id:record-id\ndataset = ':'.join([pid, rid])", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 2. Call \u201cdx extract_dataset\u201d using a supplied dataset", "metadata": {}}, {"cell_type": "code", "source": "cmd = [\"dx\", \"extract_dataset\", dataset, \"-ddd\", \"--delimiter\", \",\"]\nsubprocess.check_call(cmd)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "#### Preview data in the three dictionary (*.csv) files", "metadata": {}}, {"cell_type": "code", "source": "path = os.getcwd()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "data_dict_csv = glob.glob(os.path.join(path, \"*.data_dictionary.csv\"))[0]\ndata_dict_df = pd.read_csv(data_dict_csv)\ndata_dict_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "codings_csv = glob.glob(os.path.join(path, \"*.codings.csv\"))[0]\ncodings_df = pd.read_csv(codings_csv)\ncodings_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "entity_dict_csv = glob.glob(os.path.join(path, \"*.entity_dictionary.csv\"))[0]\nentity_dict_df = pd.read_csv(entity_dict_csv)\nentity_dict_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, 
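{"cell_type": "markdown", "source": "As an optional sanity check, the cell below counts how many fields each entity contributes. This is a minimal sketch that assumes only the `entity` column of the data dictionary, which step 3 also relies on.", "metadata": {}}, {"cell_type": "code", "source": "# Optional: number of fields listed per entity in the data dictionary\ndata_dict_df['entity'].value_counts()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, 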
{"cell_type": "markdown", "source": "### 3. Parse returned metadata and extract entity/field names", "metadata": {}}, {"cell_type": "code", "source": "data_dict_df['ent_field'] = data_dict_df['entity'].astype(str) + '.' + \\\n    data_dict_df['name'].astype(str)\n\nentity_field = data_dict_df.ent_field.values.tolist()\nentity_field[:10]", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 4. Use the extracted entity and field names as input to \u201cdx extract_dataset\u201d and extract data", "metadata": {}}, {"cell_type": "code", "source": "df_list = []\nfor field in entity_field:\n    cmd = [\"dx\", \"extract_dataset\", dataset, \"--fields\", field,\n           \"-o\", field + \"_extracted_data.csv\"]\n    subprocess.check_call(cmd)\n    df_list.append(pd.read_csv(field + \"_extracted_data.csv\"))\n\n# Concatenating by position assumes each per-field extraction returns rows in the same order;\n# drop the default integer index when writing the combined file\nresult = pd.concat(df_list, axis=1)\nresult.to_csv(\"final_extracted_data.csv\", index=False)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "#### Preview data in the retrieved data file", "metadata": {}}, {"cell_type": "code", "source": "fields_file_df = pd.read_csv(\"final_extracted_data.csv\", float_precision='round_trip')\nfields_file_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 5. Replace any coded column values of extracted data with the coded meaning", "metadata": {}}, {"cell_type": "code", "source": "data_dict_df['coding_value_type'] = data_dict_df['is_multi_select'].apply(\n    lambda x: \"list\" if x == \"yes\" else \"string\")", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_file_df_decoded = fields_file_df.copy(deep=True)\n\ndef get_meaning(code_name, code):\n    if isinstance(code, (int, float)):\n        code = str(code)\n    # If field type is float, and an integer sparse code is used for the field\n    # (example `1`), the retrieved data represents it as a float (`1.0`).\n    # Strip the `.0` suffix and search for the code in the codings dataframe\n    if codings_df.loc[(codings_df[\"coding_name\"] == code_name) &\n                      (codings_df[\"code\"] == code), \"meaning\"].empty:\n        if code.endswith('.0'):\n            code = code[:-2]\n    return codings_df.loc[(codings_df[\"coding_name\"] == code_name) &\n                          (codings_df[\"code\"] == code), \"meaning\"]\n\nfor (columnName, columnData) in fields_file_df_decoded.items():\n    code_name, data_type = data_dict_df[data_dict_df[\"ent_field\"] == columnName][\n        [\"coding_name\", \"coding_value_type\"]].values[0]\n    if not pd.isna(code_name):\n        set_of_values = set(columnData.dropna())\n        for val in set_of_values:\n            if data_type == \"list\":\n                # Multi-select values are stored as list literals, e.g. '[\"1\", \"2\"]'\n                new_val = []\n                list_val = ast.literal_eval(val)\n                for i in list_val:\n                    meaning = get_meaning(code_name, i)\n                    if not meaning.empty:\n                        new_val.append(meaning.values.item())\n                    else:\n                        new_val.append(i)\n                fields_file_df_decoded.loc[fields_file_df_decoded[columnName] == val,\n                                           columnName] = str(new_val)\n            elif data_type == \"string\":\n                meaning = get_meaning(code_name, val)\n                if not meaning.empty:\n                    fields_file_df_decoded.loc[fields_file_df_decoded[columnName] == val,\n                                               columnName] = meaning.values.item()\nfields_file_df_decoded.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_file_df_decoded.to_csv(\"extracted_data_with_code_meanings.csv\", index=False)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, 
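{"cell_type": "markdown", "source": "For wide datasets, the row-wise replacement above can be slow. The cell below is a minimal alternative sketch, not the canonical method: it builds one code-to-meaning lookup per coding from `codings_df` and vectorizes decoding of single-select (string) fields with `Series.map`. Multi-select (list) fields still need the element-wise logic above, and the column name in the commented usage is hypothetical.", "metadata": {}}, {"cell_type": "code", "source": "# Build a {code -> meaning} lookup per coding_name, normalizing codes to str\ndecode_lookup = {\n    name: dict(zip(grp['code'].astype(str), grp['meaning']))\n    for name, grp in codings_df.groupby('coding_name')\n}\n\n# Hypothetical usage for one single-select coded column `col`:\n# cname = data_dict_df.loc[data_dict_df['ent_field'] == col, 'coding_name'].item()\n# decoded = fields_file_df[col].astype(str).map(decode_lookup[cname])\n# fields_file_df[col] = decoded.fillna(fields_file_df[col])", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, 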
{"cell_type": "markdown", "source": "### 6. Drop sparsely coded values", "metadata": {}}, {"cell_type": "code", "source": "fields_sparse_code = fields_file_df.copy(deep=True)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "for (columnName, columnData) in fields_sparse_code.items():\n    code_name, data_type, is_sparse_coding = data_dict_df[\n        data_dict_df[\"ent_field\"] == columnName][\n        [\"coding_name\", \"coding_value_type\", \"is_sparse_coding\"]].values[0]\n    if not pd.isna(code_name) and is_sparse_coding == 'yes':\n        set_of_values = set(columnData.dropna())\n        for val in set_of_values:\n            if data_type == \"list\":\n                # Keep only the list elements that are not sparse codes\n                new_val = []\n                list_val = ast.literal_eval(val)\n                for i in list_val:\n                    meaning = get_meaning(code_name, i)\n                    if meaning.empty:\n                        new_val.append(i)\n                fields_sparse_code.loc[fields_sparse_code[columnName] == val,\n                                       columnName] = str(new_val)\n            elif data_type == \"string\":\n                # Drop the whole value when it matches a sparse code\n                meaning = get_meaning(code_name, val)\n                if not meaning.empty:\n                    fields_sparse_code.loc[fields_sparse_code[columnName] == val,\n                                           columnName] = None\nfields_sparse_code.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_sparse_code.to_csv(\"extracted_data_with_sparse_code_drop.csv\", index=False)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 7. Replace the column names (entity.field names) of extracted data with the field titles", "metadata": {}}, {"cell_type": "code", "source": "current_columns = list(fields_file_df.columns)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "# Map each entity.field column to its title; fall back to a sanitized\n# entity-field name when two fields share the same title\nnew_columns = {}\ntitles = []\nduplicate_titles = []\nfor val in current_columns:\n    title = data_dict_df.loc[data_dict_df[\"ent_field\"] == val,\n                             \"title\"].values.item()\n    if title not in titles:\n        titles.append(title)\n    elif title not in duplicate_titles:\n        duplicate_titles.append(title)\nfor val in current_columns:\n    title = data_dict_df.loc[data_dict_df[\"ent_field\"] == val,\n                             \"title\"].values.item()\n    if title not in duplicate_titles:\n        new_columns[val] = title\n    else:\n        new_columns[val] = val.replace(\".\", \"-\")", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_file_df.rename(columns=new_columns, inplace=True)\nfields_file_df.head()", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "code", "source": "fields_file_df.to_csv(\"extracted_data_with_updated_titles.csv\", index=False)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, {"cell_type": "markdown", "source": "### 8. Upload extracted dictionaries and data back to the project", "metadata": {}}, {"cell_type": "code", "source": "# shell=True lets the shell expand the *.csv glob\ncmd = \"dx upload *.csv\"\nsubprocess.check_call(cmd, shell=True)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}, 
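{"cell_type": "markdown", "source": "Alternatively, the upload can be done through the dxpy bindings instead of shelling out. The cell below is a hedged sketch: `dxpy.upload_local_file` uploads into the project and folder of the current job context by default, so adjust its `project`/`folder` arguments for other setups.", "metadata": {}}, {"cell_type": "code", "source": "# Sketch: upload every CSV produced by this notebook via dxpy\nfor f in glob.glob(\"*.csv\"):\n    dxpy.upload_local_file(f)", "metadata": {"trusted": true}, "execution_count": null, "outputs": []}]}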