From b29a564956d3cd65936008bce6951e38382bdfc8 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 18 Oct 2023 20:50:16 +0000 Subject: [PATCH 1/2] add noetbook --- .../notebook_analysis/scan_templates.ipynb | 1977 +++++++++++++++++ 1 file changed, 1977 insertions(+) create mode 100644 data_analysis/notebook_analysis/scan_templates.ipynb diff --git a/data_analysis/notebook_analysis/scan_templates.ipynb b/data_analysis/notebook_analysis/scan_templates.ipynb new file mode 100644 index 0000000..98fde8b --- /dev/null +++ b/data_analysis/notebook_analysis/scan_templates.ipynb @@ -0,0 +1,1977 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/fsx/loubna/miniconda3/envs/eval-harness/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Resolving data files: 100%|██████████| 64/64 [00:00<00:00, 236090.99it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "import pandas as pd\n", + "from datasets import load_dataset, Dataset\n", + "\n", + "ds = load_dataset(\"bigcode/jupyter_scripts_dedup\", split=\"train\", streaming=True)\n", + "ds = list(ds.take(10_000))\n", + "ds = Dataset.from_pandas(pd.DataFrame(data=ds))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map (num_proc=24): 100%|██████████| 10000/10000 [00:14<00:00, 680.55 examples/s]\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "TEMPLATES = [\"# YOUR CODE HERE\", \"TODO: implement this function\", \"# TODO: Implement me\", \"# TODO: Write your implementation here\"]\n", + "\n", + "def detect_template(example):\n", + " content = example[\"script\"]\n", + " dict_templates = []\n", + " for template in TEMPLATES:\n", + " if template.lower() in content.lower():\n", + " has_template = True\n", + " dict_templates.append({\"template\": template, \"index\": content.lower().index(template.lower())})\n", + " return {\"templates\": json.dumps(dict_templates) if dict_templates else \"\"}\n", + "\n", + "ds = ds.map(detect_template, num_proc=24)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Filter: 100%|██████████| 10000/10000 [00:00<00:00, 19637.45 examples/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['path', 'content_id', 'detected_licenses', 'license_type', 'repo_name', 'repo_url', 'star_events_count', 'fork_events_count', 'gha_license_id', 'gha_event_created_at', 'gha_updated_at', 'gha_language', 'language', 'is_generated', 'is_vendor', 'conversion_extension', 'size', 'script', 'script_size', 'templates'],\n", + " num_rows: 214\n", + "})" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_w_templates = ds.filter(lambda x: x[\"templates\"])\n", + "data_w_templates" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.14% of the files have these templates\n" + ] + } + ], + 
"source": [ + "print(f\"{len(data_w_templates) * 100/10_000}% of the files have these templates\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'# YOUR CODE HERE': 209,\n", + " '# TODO: Implement me': 2,\n", + " 'TODO: implement this function': 3})" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "templates_found = [json.loads(x)[0][\"template\"] for x in data_w_templates[\"templates\"]]\n", + "Counter(templates_found)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'template': '# YOUR CODE HERE', 'index': 19972}]\n", + "# ---\n", + "# jupyter:\n", + "# jupytext:\n", + "# text_representation:\n", + "# extension: .py\n", + "# format_name: light\n", + "# format_version: '1.5'\n", + "# jupytext_version: 1.15.2\n", + "# kernelspec:\n", + "# display_name: Python 3\n", + "# language: python\n", + "# name: python3\n", + "# ---\n", + "\n", + "import math\n", + "\n", + "\n", + "def primeFact(n):\n", + " while(n % 2 == 0):\n", + " n = n // 2\n", + " yield 2\n", + " \n", + " for i in range(3,round(math.sqrt(n)),2):\n", + " while(n % i == 0):\n", + " n = n // i\n", + " yield i\n", + " \n", + " if n > 2:\n", + " yield n\n", + "\n", + "\n", + "for i in primeFact(81):\n", + " print(i)\n", + "\n", + "primeFact(81)\n", + "\n", + "\n", + "th labels, etc.\n", + "#\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.plot([2, 4, 6, 4])\n", + "plt.ylabel('Numbers')\n", + "plt.xlabel('Indices')\n", + "plt.title('Myplot')\n", + "plt.show()\n", + "\n", + "# If you provide a single list or array to the plot() command, matplotlib assumes it is a sequence of y values, and automatically generates the x values for you. Since python ranges start with 0, the default x vector has the same length as y but starts with 0. Hence the x data are [0,1,2,3].\n", + "#\n", + "\n", + "# **plot x versus y**\n", + "\n", + "plt.plot([1, 2, 3, 4], [1, 4, 9, 16])\n", + "plt.ylabel('squares')\n", + "plt.xlabel('numbers')\n", + "plt.grid() #grid on\n", + "plt.show()\n", + "\n", + "# For every x, y pair of arguments, there is an optional third argument which is the **format string** that indicates the color and line type of the plot. \n", + "\n", + "plt.plot([1, 2, 3, 4], [1, 4, 9, 16], 'ro') #ro- red-dot(o shape)\n", + "plt.grid()\n", + "plt.show()\n", + "\n", + "# If matplotlib were limited to working with lists, it would be fairly useless for numeric processing. Generally, you will use **numpy arrays**. 
In fact, all sequences are converted to numpy arrays internally.\n", + "#\n", + "\n", + "import numpy as np\n", + "\n", + "# +\n", + "t = np.arange(0., 5., 0.2)\n", + "\n", + "# blue dashes, red squares and green triangles\n", + "plt.plot(t, t**2, 'b--', label='^2')\n", + "plt.plot(t, t**2.2, 'rs', label='^2.2')\n", + "plt.plot(t, t**2.5, 'g^', label='^2.5')\n", + "plt.grid()\n", + "plt.legend() #add legend based on line labels\n", + "plt.show()\n", + "# -\n", + "\n", + "# ## Controlling line properties\n", + "#\n", + "#\n", + "# **use keyword args**\n", + "\n", + "x = [1, 2, 3, 4]\n", + "y = [1, 4, 9, 16]\n", + "plt.plot(x, y, linewidth=5.0)\n", + "plt.show()\n", + "\n", + "# **use the setp()**\n", + "\n", + "# +\n", + "x1 = [1, 2, 3, 4]\n", + "y1 = [1, 4, 9, 16]\n", + "x2 = [1, 2, 3, 4]\n", + "y2 = [2, 4, 6, 8]\n", + "lines = plt.plot(x1, y1, x2, y2)\n", + "\n", + "# use keyword args\n", + "plt.setp(lines[0], color='r', linewidth=2.0) #setp= set properties\n", + "\n", + "# or MATLAB style string value pairs\n", + "plt.setp(lines[1], 'color', 'g', 'linewidth', 2.0)\n", + "\n", + "plt.grid()\n", + "\n", + "\n", + "# -\n", + "\n", + "# ## working with multiple figures and axes\n", + "#\n", + "\n", + "# +\n", + "def f(t):\n", + " return np.exp(-t) * np.cos(2*np.pi*t)\n", + "\n", + "t1 = np.arange(0.0, 5.0, 0.1)\n", + "t2 = np.arange(0.0, 5.0, 0.02)\n", + "\n", + "plt.figure(1)\n", + "\n", + "# the subplot() command specifies numrows, numcols, \n", + "# fignum where fignum ranges from 1 to numrows*numcols.\n", + "plt.subplot(211)\n", + "plt.grid()\n", + "plt.plot(t1, f(t1), 'b--')\n", + "\n", + "plt.subplot(212)\n", + "plt.plot(t2, np.cos(2*np.pi*t2), 'r--')\n", + "plt.show()\n", + "\n", + "# +\n", + "plt.figure(1) #the first figure\n", + "plt.subplot(211) #the first subplot in the first figure\n", + "plt.plot([1, 2, 3])\n", + "plt.subplot(212) #the second subplot in the first figure\n", + "\n", + "plt.figure(2) #a second figure\n", + "plt.plot([4, 5, 6]) #creates a subplot(111) by default\n", + "\n", + "plt.figure(1) #figure 1 current; subplot(212 still current\n", + "plt.subplot(211) #make subplot(211) in figure 1 current\n", + "plt.title('Easy as 1, 2, 3') #subplot 211 title\n", + "plt.show()\n", + "tives using autograd functionality `grad()`:\n", + "# \n", + "# 1. **(1 pt.)** Plot `ReLU`, `ELU` ($\\alpha = 1$), `Softplus` ($\\beta = 1$) and `Sign`, `Sigmoid`, `Softsign`, `Tanh`.\n", + "# Which of these functions may be, and which - definitely, are a poor choise as an activation function in a neural network? 
Why?\n", + "\n", + "# +\n", + "# %matplotlib inline\n", + "import torch.nn.functional as F\n", + "import matplotlib.pyplot as plt\n", + "import torch\n", + "\n", + "x = torch.arange(-2, 2, .01, requires_grad=True)\n", + "x_np = x.detach().numpy()\n", + "x.sum().backward() # to create x.grad\n", + "\n", + "f, axes = plt.subplots(2, 2, sharex=True, figsize=(14, 5))\n", + "axes[0, 0].set_title('Values')\n", + "axes[0, 1].set_title('Derivatives')\n", + "\n", + "for i, function_set in (0, (('ReLU', F.relu), ('ELU', F.elu), ('Softplus', F.softplus))), \\\n", + " (1, (('Sign', torch.sign), ('Sigmoid', torch.sigmoid), ('Softsign', F.softsign), ('Tanh', torch.tanh))):\n", + " for function_name, activation in function_set:\n", + " ### BEGIN Solution\n", + " x.grad.zero_()\n", + " xs = x.detach().numpy()\n", + " values = activation(x).data.numpy()\n", + " activation(x).sum().backward()\n", + " derivatives = x.grad.data.numpy()\n", + " axes[i, 0].plot(xs, values, label = function_name)\n", + " axes[i, 1].plot(xs, derivatives , label=function_name)\n", + " ### END Solution\n", + "\n", + " axes[i, 0].legend()\n", + " axes[i, 1].legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "# -\n", + "\n", + "# ### Task 1.2. MNIST classification. (4 points)\n", + "#\n", + "# At the **Seminar 10** on neural networks, we built an MLP (Multilayer perceptron) with one hidden layer using our numpy implementations of linear layer and logistic and softmax activation functions. Your task is to\n", + "#\n", + "# 1. **(1 pt.)** Implement the MPL modules, including the Softmax cross entropy between `logits` and `labels`.\n", + "# 2. **(2 pt.)** Train our numpy realization of MLP to classify the MNIST from `sklearn.datasets()`. Getting accuracy on validation `> 90%`.\n", + "# 3. **(1 pt.)** Compare the acccuracy of classification to your scores from `Part 1` with and without dimensionality reduction. Will be this comparison fair? :) Derive the confusion matrix for all digits classes. 
What digits were predicted better or worse, than others?\n", + "#\n", + "\n", + "from sklearn.metrics import confusion_matrix, accuracy_score\n", + "from sklearn.datasets import load_digits\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# +\n", + "# fetch the dataset.\n", + "digits, targets = load_digits(return_X_y=True)\n", + "digits = digits.astype(np.float32) / 255\n", + "\n", + "digits_train, digits_test, targets_train, targets_test = train_test_split(digits, targets, random_state=0)\n", + "\n", + "train_size = digits_train.shape[0]\n", + "\n", + "input_size = 8*8\n", + "classes_n = 10\n", + "# -\n", + "\n", + "digits.shape\n", + "\n", + "\n", + "# **Implement the MLP with backprop.**\n", + "\n", + "# +\n", + "class Linear:\n", + " def __init__(self, input_size, output_size):\n", + " self.thetas = np.random.randn(input_size, output_size)\n", + " self.thetas_grads = np.empty_like(self.thetas)\n", + " self.bias = np.random.randn(output_size).reshape(-1, 1)\n", + " self.bias_grads = np.empty_like(self.bias).reshape(-1, 1)\n", + "\n", + " def forward(self, x): \n", + " self.x = x\n", + "# print('forward_linear')\n", + "# print('linear forward, x shape: ', x.shape)\n", + "# print('linear forward, thetas.shape: ',self.thetas.shape)\n", + "# print('linear forward bias shape: ', self.bias.shape)\n", + " return np.dot(x, self.thetas.T) + self.bias\n", + " \n", + " def backward(self, x, output_grad):\n", + " ### BEGIN Solution\n", + " print('backward_linear')\n", + " print('backward_linear output_grad.shape: ', output_grad.shape)\n", + " print('backward_linear thetas.shape: ', self.thetas.shape)\n", + " print('backward_linear bias_grads shape: ', self.bias_grads.shape)\n", + " print('backprob x shape', x.shape)\n", + " print('backprob self.x shape', self.x.shape)\n", + " self.thetas_grads += np.dot(self.x, output_grad.T)\n", + " self.bias_grads += np.sum(output_grad, axis=1, keepdims=True)\n", + " ### END Solution\n", + " return np.dot(self.thetas, output_grad)\n", + " \n", + "class LogisticActivation:\n", + " def forward(self, x):\n", + " #print('forward_logistic')\n", + " self.x = x\n", + " self.output = 1.0/(1 + np.exp(-x))\n", + " return self.output\n", + "\n", + "\n", + " def backward(self, x, output_grad):\n", + " ### BEGIN Solution\n", + " #print('backward_logistic')\n", + " a = self.output.reshape(-1, 1)\n", + " b = 1 - a\n", + " c = a*b\n", + " input_grad = c.T * output_grad\n", + " ### END Solution\n", + " return input_grad\n", + " \n", + "\n", + "class MLP:\n", + " def __init__(self, input_size, hidden_layer_size, output_size):\n", + " self.linear1 = Linear(input_size, hidden_layer_size)\n", + " self.activation1 = LogisticActivation()\n", + " self.linear2 = Linear(hidden_layer_size, output_size)\n", + " \n", + " \n", + " def forward(self, x):\n", + " #print('forward_mlp')\n", + " asd = self.linear2.forward(self.activation1.forward(self.linear1.forward(x)))\n", + " return np.exp(asd) / np.sum(np.exp(asd), axis = 0, keepdims=True)\n", + "\n", + "\n", + " def backward(self, x, output_grad):\n", + " #print('backward_mlp')\n", + " output = self.linear1.backward(x, self.activation1.backward(x,self.linear2.backward(x,output_grad)))\n", + " return output\n", + "\n", + "\n", + "# +\n", + "### BEGIN Solution \n", + "def softmax_crossentropy_with_logits(logits, reference_answers):\n", + " logits_vec = np.zeros(logits.shape)\n", + " logits_vec[reference_answers, :] = 1\n", + " 
return -np.sum(logits_vec*np.log(logits) + (1-logits_vec)*np.log(1-logits), axis=0)\n", + "\n", + "\n", + "def grad_softmax_crossentropy_with_logits(logits, reference_answers):\n", + " logits_vec = np.zeros(logits.shape)\n", + " logits_vec[reference_answers, :] = 1\n", + " out = logits_vec - logits\n", + " return out\n", + "### END Solution\n", + "\n", + "\n", + "# +\n", + "def softmax_crossentropy_with_logits(logits,reference_answers):\n", + " \"\"\"Compute crossentropy from logits[batch,n_classes] and ids of correct answers\"\"\"\n", + " logits_for_answers = logits[np.arange(len(logits)),reference_answers]\n", + " \n", + " xentropy = - logits_for_answers + np.log(np.sum(np.exp(logits),axis=-1))\n", + " \n", + " return xentropy\n", + "\n", + "def grad_softmax_crossentropy_with_logits(logits,reference_answers):\n", + " \"\"\"Compute crossentropy gradient from logits[batch,n_classes] and ids of correct answers\"\"\"\n", + " ones_for_answers = np.zeros_like(logits)\n", + " ones_for_answers[np.arange(len(logits)),reference_answers] = 1\n", + " \n", + " softmax = np.exp(logits) / np.exp(logits).sum(axis=-1,keepdims=True)\n", + " \n", + " return (- ones_for_answers + softmax) / logits.shape[0]\n", + "\n", + "\n", + "# +\n", + "np.random.seed(42)\n", + "\n", + "mlp = MLP(input_size=input_size, hidden_layer_size=100, output_size=classes_n)\n", + "\n", + "epochs_n = 100\n", + "learning_curve = [0] * epochs_n\n", + "test_curve = [0] * epochs_n\n", + "\n", + "x_train = digits_train\n", + "x_test = digits_test\n", + "y_train = targets_train\n", + "y_test = targets_test\n", + "\n", + "learning_rate = 1e-2\n", + "\n", + "for epoch in range(epochs_n):\n", + " \n", + " for sample_i in range(train_size):\n", + " x = x_train[sample_i].reshape(-1, 1)\n", + " #print('x shape: ',x.shape)\n", + " target = np.array([y_train[sample_i]])\n", + " ### BEGIN Solution\n", + " mlp.linear1.thetas_grads = np.zeros_like(mlp.linear1.thetas_grads)\n", + " mlp.linear2.thetas_grads = np.zeros_like(mlp.linear2.thetas_grads)\n", + " mlp.linear1.bias_grads = np.zeros_like(mlp.linear1.bias_grads)\n", + " mlp.linear2.bias_grads = np.zeros_like(mlp.linear2.bias_grads)\n", + " prediction = mlp.forward(x)\n", + " loss = softmax_crossentropy_with_logits(prediction, target)\n", + " print(prediction)\n", + " grad_loss = grad_softmax_crossentropy_with_logits(prediction, target)\n", + " learning_curve[epoch] += loss\n", + " mlp.backward(x, grad_loss)\n", + " \n", + " mlp.linear1.thetas -= mlp.linear1.thetas_grads * learning_rate\n", + " mlp.linear2.thetas -= mlp.linear2.thetas_grads * learning_rate\n", + " mlp.linear1.bias -= mlp.linear1.bias_grads * learning_rate\n", + " mlp.linear2.bias -= mlp.linear2.bias_grads * learning_rate\n", + " learning_curve[epoch] /= train_size\n", + " preds = []\n", + " #print(loss)\n", + " for sample_i in range(digits_test.shape[0]):\n", + " x = x_test[sample_i].reshape(-1, 1)\n", + " target = np.array([y_test[sample_i]])\n", + " pred = mlp.forward(x)\n", + " loss = softmax_crossentropy_with_logits(pred, target)\n", + " test_curve[epoch] += loss\n", + " preds.append(np.argmax(pred, axis=0))\n", + " acc = accuracy_score(preds, y_test)\n", + " test_curve[epoch] /=digits_test.shape[0]\n", + "\n", + " if epoch % 10 == 0:\n", + " print('Starting epoch {}'.format(epoch),', Accuracy on test: ', acc)\n", + "\n", + " print()\n", + "\n", + " ### END Solution\n", + "plt.plot(learning_curve,label='learning')\n", + "plt.plot(test_curve, label='test')\n", + "plt.legend()\n", + "# -\n", + "\n", + "# confusion matrix\n", + 
"predictions = mlp.forward(digits).argmax(axis = 1)\n", + "pd.DataFrame(confusion_matrix(targets, predictions))\n", + "# >>> your solution here <<<\n", + "\n", + "# - Number 1, 3, 9 and 7 were predicted worse compared to other numbers. Thus is due to some similarity between numbers. The only strange thing is that there are 18 confusions between '1' and '8'. This result is worse compared to KNN with PCA from part 1, but such comparison is not fare since PCA provides great advantage by culling low-variance features.\n", + "\n", + "# ## Task 2. Autoencoders. (7 points)\n", + "#\n", + "# ### Task 2.1. Autoencoder on the tabular data. (3 points)\n", + "#\n", + "#\n", + "# We will biuld the latent representation for tabular data with simple Autoencoding (AE) network. We will work with the cancer dataset from the scikit-learn package. You are to follow the instructions `1.0 - 1.6`. \n", + "#\n", + "# 1. **(1 pt.)** Implement the AE modules;\n", + "# 2. **(2 pt.)** Train AE to get latent representation of the cancer dataset from `sklearn.datasets()`. Use `MSE` loss and get < $0.28$ on validation, with AE \"bottleneck\" = $2$;\n", + "# 3. **(1 pt.)** Plot the latent representation of whole dataset in 2D and use colors to show object of differneet classes;\n", + "\n", + "# +\n", + "# imports\n", + "import sklearn.datasets as sk_data\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.utils.data as torch_data\n", + "# -\n", + "\n", + "# #### 1.0 Featch the data. Scale it and split on train and test.\n", + "\n", + "# +\n", + "cancer_dset = sk_data.load_breast_cancer()\n", + "\n", + "X = StandardScaler().fit_transform(cancer_dset['data'])\n", + "\n", + "print('Features: ', list(cancer_dset['feature_names']))\n", + "print('\\nShape:', X.shape)\n", + "\n", + "X_train, X_val, y_train, y_val = train_test_split(X, cancer_dset['target'], test_size=0.2, random_state=42)\n", + "print('\\nTrain size: ', len(X_train))\n", + "print('Validation size: ', len(X_val))\n", + "\n", + "\n", + "# -\n", + "\n", + "# #### 1.1 Let us firtly make the dataset, which we'll be able to use with pytorch dataloader. \n", + "# Implement the `__len__` and `__getitem__` methods.\n", + "\n", + "class CancerData(torch_data.Dataset):\n", + " def __init__(self, X, y):\n", + " super(CancerData, self).__init__()\n", + " self.X = torch.tensor(X, dtype=torch.float32)\n", + " self.y = torch.tensor(y, dtype=torch.float32)\n", + " \n", + " def __len__(self):\n", + " return self.X.shape[0]\n", + " \n", + " def __getitem__(self, idx):\n", + " return self.X[idx].unsqueeze(0), self.y[idx]\n", + "\n", + "\n", + "# +\n", + "train_dset = CancerData(X_train, y_train) \n", + "val_dset = CancerData(X_val, y_val) \n", + "\n", + "print(train_dset[5])\n", + "\n", + "\n", + "# -\n", + "\n", + "# #### 1.2 Now, we'll be making a base class for out autoencoder. \n", + "# It takes as input encoder and decoder (it will be tow neural networks). 
Using this two oblects, your task is to implement the forward pass.\n", + "\n", + "class MyFirstAE(nn.Module):\n", + " def __init__(self, encoder, decoder):\n", + " super(MyFirstAE, self).__init__()\n", + " self.encoder = encoder\n", + " self.decoder = decoder\n", + " \n", + " def forward(self, x):\n", + " \"\"\"\n", + " Take a mini-batch as an input, encode it to the latent space and decode back to the original space\n", + " x_out = decoder(encoder(x))\n", + " :param x: torch.tensor, (MB, x_dim)\n", + " :return: torch.tensor, (MB, x_dim)\n", + " \"\"\" \n", + " return self.decoder(self.encoder(x))# >>> your solution here <<<\n", + "\n", + "\n", + "# #### 1.3 It is time to create neural networks for encoder an decoder networks.\n", + "# Make hidden size of the network to be equal to `2`.\n", + "#\n", + "# **Hint.** You can use `nn.Sequential` to create your archtectures.\n", + "\n", + "# +\n", + "ss = 200\n", + "sample_size = X.shape[1]\n", + "\n", + "encoder = nn.Sequential(\n", + " nn.Linear(sample_size, ss*1),\n", + " \n", + " nn.ReLU(inplace=True),\n", + " nn.Linear(ss*1,2)\n", + " ) \n", + "\n", + "decoder = nn.Sequential(\n", + " nn.Linear(2, ss*1),\n", + " \n", + " nn.ReLU(inplace=True),\n", + " nn.Linear(ss*1, sample_size) \n", + " )\n", + "\n", + "# +\n", + "device = 'cpu'\n", + "\n", + "net = MyFirstAE(encoder, decoder) \n", + "criterion = nn.MSELoss()\n", + "optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)\n", + "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.7)\n", + "\n", + "\n", + "train_loader = torch_data.DataLoader(train_dset, batch_size=100, shuffle=True) \n", + "val_loader = torch_data.DataLoader(val_dset, batch_size=100, shuffle=False) \n", + "\n", + "\n", + "# -\n", + "\n", + "# #### 1.4 Implement the missing parts in the `train` function\n", + "\n", + "def train(epochs, net, criterion, optimizer, train_loader, val_loader,scheduler=None, verbose=True, save_dir=None):\n", + " net.to(device)\n", + " for epoch in range(1, epochs+1):\n", + " net.train()\n", + " for X, _ in train_loader:\n", + " # >>> your solution here <<<\n", + " X = X.to(device)\n", + " out = net(X)\n", + " loss = criterion(out, X)\n", + " \n", + " optimizer.zero_grad()\n", + " \n", + " loss.backward()\n", + " optimizer.step()\n", + " # defining NN evaluation\n", + " net.eval()\n", + " for X, _ in val_loader:\n", + " \n", + " # >>> your solution here <<<\n", + " X = X.to(device)\n", + " out = net(X)\n", + " val_loss = criterion(out, X)\n", + " \n", + " if scheduler is not None:\n", + " scheduler.step()\n", + " freq = max(epochs//20,1)\n", + " if verbose and epoch%freq==0:\n", + " print('Epoch {}/{} || Loss: Train {:.4f} | Validation {:.4f}'.format(epoch, epochs, loss.item(), val_loss.item()))\n", + "\n", + "\n", + "# #### 1.5 Train your AE on breast cancer dataset. \n", + "# Your goal is to get validation error < that 0.3.\n", + "#\n", + "# Some feartures that mey help you to improve performance:\n", + "# * `Dropout`\n", + "# * `Batchnorm`\n", + "# * lr scheduler (e.g. reduce learning)\n", + "# * Batch size\n", + "\n", + "# for `MSE` loss get < 0.28 on validation, with AE \"bottleneck\" = 2\n", + "train(100, net, criterion, optimizer, train_loader, val_loader, scheduler) \n", + "\n", + "# #### 1.6 Let's take a look at the latent space. 
\n", + "# Encode the whole dataset, using your AE, plot it in 2D and use colors to show object of differneet classes\n", + "\n", + "# +\n", + " ### BEGIN Solution\n", + "plt.figure(figsize=(14, 5))\n", + "net.eval()\n", + "\n", + "enc = net.encoder(torch.from_numpy(X).float()).detach().numpy()\n", + "plt.scatter(enc[:,0], enc[:,1], c=cancer_dset['target'], alpha=0.7);\n", + "plt.title('Latent space from the Autoencoder bottle neck, purple dots go for malignant samples. ');\n", + " ### END Solution\n", + "\n", + "\n", + "# -\n", + "\n", + "# ### Task 2.2. Autoencoder on kMNIST. (4 points)\n", + "#\n", + "#\n", + "# We will build the latent representation for `kMRIST` dataset, which you are already familiar with, using our AE network. We will work with the data from `Part 1`.\n", + "#\n", + "# 1. **(2 pt.)** Train AE to get latent representation of the `kMNIST` dataset from `sklearn.datasets()`. Follow the instructions `2.0 - 2.4`. Use `MSE` loss and get < $0.035$ on validation, with AE \"bottleneck\" < $40$;\n", + "# 2. **(1 pt.)** Plot 10 images and their reconstructions. Plot the latent representation of the whole dataset in 3D, compare visually to your manifold from `Part 1` and upload (pictures) screenshots to this notebook.\n", + "# 3. **(1 pt.)** Get the classification accuracy for the latest features. Use the `KNN` classifier with any hyperparameters you'll choose. Write a couple of sentences comparing the results of `manifold learning` and ` AE`. \n", + "# 3. **(BONUS 3 pt.)** Sampling from latent representation.\n", + "\n", + "# +\n", + "def load(f):\n", + " return np.load(f)['arr_0']\n", + "\n", + "# Load the data\n", + "x_train = load('data/kmnist/kmnist-train-imgs.npz')\n", + "x_test = load('data/kmnist/kmnist-test-imgs.npz')\n", + "y_train = load('data/kmnist/kmnist-train-labels.npz')\n", + "y_test = load('data/kmnist/kmnist-test-labels.npz')\n", + "\n", + "# Reshape the data and scale\n", + "x_train = x_train.reshape(-1, 28*28)/255# >>> your solution here <<<\n", + "x_test = x_test.reshape(-1, 28*28)/255# >>> your solution here <<<\n", + "# -\n", + "\n", + "fig, ax = plt.subplots(ncols=10, figsize=(20, 5))\n", + "for i in range(10):\n", + " ax[i].imshow(x_train[i].reshape(28,28));\n", + " ax[i].axis('off')\n", + "\n", + "\n", + "class kMNISTData(torch_data.Dataset):\n", + " def __init__(self, X, y):\n", + " super(kMNISTData, self).__init__()\n", + " self.X = torch.tensor(X, dtype=torch.float32) \n", + " self.y = y.astype(int)\n", + " \n", + " def __len__(self):\n", + " return self.X.shape[0]# >>> your solution here <<<\n", + " \n", + " def __getitem__(self, idx):\n", + " return self.X[idx].unsqueeze(0), self.y[idx]# >>> your solution here <<<\n", + "\n", + "\n", + "train_kmnist = kMNISTData(x_train, y_train) ### YOUR CODE HERE ###\n", + "test_kmnist = kMNISTData(x_test, y_test) ### YOUR CODE HERE ###\n", + "\n", + "# #### 2.0 Create encoder and decoder network for kMNIST. \n", + "# You can either use convolutions or flatten the images and use linear layers. 
You can choose hidden size (not larger than 40) and any architecture you like.\n", + "\n", + "x_train.shape\n", + "\n", + "sample_size = x_train.shape[1]\n", + "ss = 128\n", + "encoder = lambda hid: nn.Sequential(\n", + " nn.Linear(sample_size, ss*4),\n", + " nn.LeakyReLU(inplace=True),\n", + " nn.Dropout(0.2),\n", + " nn.Linear(ss*4, ss*2),\n", + " nn.LeakyReLU(inplace=True),\n", + " nn.Dropout(0.2),\n", + " nn.Linear(ss*2, hid)\n", + " )\n", + "decoder = lambda hid: nn.Sequential(\n", + " nn.Linear(hid, ss*2),\n", + " nn.LeakyReLU(inplace=True),\n", + " nn.Dropout(0.2),\n", + " nn.Linear(ss*2, ss*4),\n", + " nn.LeakyReLU(inplace=True),\n", + " nn.Dropout(0.2),\n", + " nn.Linear(ss*4, sample_size)\n", + " \n", + " ) \n", + "\n", + "\n", + "# #### 2.1 Train AE on the kMNIST. \n", + "# Your goal is to obtain MSE on the test set < $0.035$ on `MyFirstAE(encoder(40), decoder(40))`.\n", + "\n", + "# +\n", + "device = 'cpu'\n", + "\n", + "net = MyFirstAE(encoder(40), decoder(40)) \n", + "criterion = nn.MSELoss()\n", + "optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)\n", + "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.7)\n", + "\n", + "train_loader = torch_data.DataLoader(train_kmnist, batch_size=128, shuffle=True) \n", + "val_loader = torch_data.DataLoader(test_kmnist, batch_size=128, shuffle=False) \n", + "# -\n", + "\n", + "epochs = 100\n", + "train(epochs, net, criterion, optimizer, train_loader, val_loader, scheduler)\n", + "\n", + "# #### 2.2 Plot any 10 images and their reconstructions.\n", + "\n", + "fig, ax = plt.subplots(ncols=10, nrows=2, figsize=(20, 5))\n", + "for i in range(10):\n", + " im = train_kmnist[i][0]\n", + " rec = net(im.reshape(1, 28*28).to(device))\n", + " ax[0, i].imshow(im[0].reshape(28,28));\n", + " ax[1, i].imshow(rec.detach().numpy().reshape(28,28));\n", + " ax[0, i].axis('off')\n", + " ax[1, i].axis('off')\n", + "\n", + "# #### 2.3 Plot the latent representation of whole dataset in 3D, compare visually to your manifold from `Part 1` and upload (pictures) screenshoots to this notebook.\n", + "\n", + "# +\n", + "from mpl_toolkits.mplot3d import Axes3D\n", + "fig = plt.figure(figsize=(20, 20))\n", + "ax = Axes3D(fig)\n", + "\n", + "cdict = {0: 'red', 1: 'blue', 2: 'green', 3: 'olive', 4: 'magenta', 5: 'yellow', 6: 'black',\n", + " 7: 'orange', 8: 'pink', 9: 'purple'}\n", + "\n", + "target = np.concatenate((train_kmnist[:][1], test_kmnist[:][1]), axis=0)\n", + "all_data = np.concatenate((train_kmnist[:][0][0], test_kmnist[:][0][0]), axis=0)\n", + "rec = net.encoder(torch.from_numpy(all_data)).detach().numpy()\n", + "ax.scatter(rec[:, 0], rec[:, 1], rec[:, 2], c=target)\n", + "# -\n", + "\n", + "# PCA_kNN from part 1 3D represenation:![pca.png](attachment:pca.png)\n", + "\n", + "# Autoencoder 3D representation: ![autoencoder.png](attachment:autoencoder.png)\n", + "\n", + "# #### 2.4 Get the classification accuracy for the latest features. Use the `KNN` classifier with any hyperparameters you'll choose. Write a couple of sentences comparing the results of `manifold learning` and ` AE`. 
\n", + "\n", + "d = [rec[i, :] for i in range(rec.shape[0])]\n", + "df = pd.DataFrame(data=d)\n", + "df['target'] = target\n", + "\n", + "datka = df.drop('target', axis=1)\n", + "targ = df.target\n", + "\n", + "# +\n", + "from sklearn.model_selection import cross_val_score, StratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "cv_score = []\n", + "sk = StratifiedKFold(n_splits=5)\n", + "\n", + "knn = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)\n", + "cv_score.append(cross_val_score(knn, datka, targ, cv=sk).mean())\n", + "print('cv accuracy: ', np.max(cv_score))\n", + "# -\n", + "\n", + "# - KNN shown 1.5% higher accuracy compared to the case with PCA from Part 1. Isomap could have shown better results, but it's very computationaly heavy, this is why I used PCA in Part 1. Autoencoder, by the way, is very computationaly light(tooks less than 1 sec to encode-decode on my laptop)\n", + "\n", + "# ### BONUS: Sampling from latent space. (3 points)\n", + "#\n", + "# Imagine, that you want to generate images, using you AE. To do that, you need to sample from the latent space and then decode the result. Of course, we do not know the exact distribution of the latent space. Therefore, let us assume that latent space distribution is Gaussian:\n", + "#\n", + "# * Encode all the training images into the latent space\n", + "# * Calculate mean and covariance matrix ($\\mu$ and $\\Sigma$)\n", + "# * Sample $z$ from the $\\mathcal{N}(\\mu, \\Sigma)$\n", + "# * Decode $z$ from the previous step to obtain an image\n", + "#\n", + "# Using the procedure described above, sample 10 images and plot them (they should be similar to those of the initial dataset)\n", + "\n", + "target = train_kmnist[:][1]\n", + "encoded = net.encoder(train_kmnist[:][0][0]).detach().numpy()\n", + "#calc mean\n", + "mean = np.mean(encoded, axis=0)\n", + "#calc cov matrix\n", + "cov = np.cov(encoded.T)\n", + "#draw 10 samples\n", + "samples = np.random.multivariate_normal(mean, cov, 10)\n", + "fig, ax = plt.subplots(ncols=10, figsize=(20, 5))\n", + "samples = torch.from_numpy(samples)\n", + "#decode samples\n", + "decoded = net.decoder(samples.float()).detach().numpy()\n", + "for i in range(10):\n", + " ax[i].imshow(decoded[i].reshape(28,28));\n", + " ax[i].axis('off')\n", + "\n", + "# **Yeah, so you can get more than `100%` for the homework, good luck!**\n", + "\n" + ] + } + ], + "source": [ + "template = json.loads(data_w_templates[0][\"templates\"])\n", + "print(template)\n", + "print(data_w_templates[0][\"script\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'template': '# YOUR CODE HERE', 'index': 10310}]\n", + "# ---\n", + "# jupyter:\n", + "# jupytext:\n", + "# text_representation:\n", + "# extension: .py\n", + "# format_name: light\n", + "# format_version: '1.5'\n", + "# jupytext_version: 1.15.2\n", + "# kernelspec:\n", + "# display_name: Python 3\n", + "# name: python3\n", + "# ---\n", + "\n", + "# + [markdown] id=\"VRtcA8J29L7D\"\n", + "# Kimlik doğrulama\n", + "\n", + "# + colab={\"base_uri\": \"https://localhost:8080/\"} id=\"n_G9X5Xk827n\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622372828044, \"user_tz\": -180, \"elapsed\": 17987, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": 
\"01635209815510460280\"}} outputId=\"4e9c899b-2c88-4d03-b443-50adeef31070\"\n", + "from google.colab import drive\n", + "drive.mount(\"/gdrive\")\n", + "# %cd / gdrive\n", + "\n", + "# + [markdown] id=\"PVlQ1RhX9Qbv\"\n", + "# # Kütüphane kurulumları ve ağların oluşturulması\n", + "\n", + "# + id=\"sFTronKs9HuB\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622372897657, \"user_tz\": -180, \"elapsed\": 487, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}}\n", + "from keras import Input , layers\n", + "\n", + "# + id=\"7dhzxSf49c_O\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622372991905, \"user_tz\": -180, \"elapsed\": 5988, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}}\n", + "input_tensor = Input((32 , ) )\n", + "dense = layers.Dense(32 , activation= \"relu\")\n", + "output_tensor = dense(input_tensor)\n", + "\n", + "# + id=\"tLe2ArG-9ypO\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622373054615, \"user_tz\": -180, \"elapsed\": 599, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}}\n", + "from keras.models import Sequential ,Model\n", + "\n", + "# + [markdown] id=\"Usm2YHve-DsF\"\n", + "# ## Sequantial Model\n", + "\n", + "# + colab={\"base_uri\": \"https://localhost:8080/\"} id=\"7_CMbzNo-DPM\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622373376694, \"user_tz\": -180, \"elapsed\": 498, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}} outputId=\"982bfdb9-9f56-4d7d-f051-d25a1e4b0b17\"\n", + "seq_model = Sequential()\n", + "seq_model.add(layers.Dense(32 , activation= \"relu\" , input_shape = (64, )))\n", + "seq_model.add(layers.Dense(32 , activation= \"relu\"))\n", + "seq_model.add(layers.Dense(10 , activation= \"softmax\"))\n", + "\n", + "seq_model.summary()\n", + "\n", + "# + [markdown] id=\"SIITOHiw_V5r\"\n", + "# ### ``` functional ``` Model\n", + "#\n", + "#\n", + "#\n", + "\n", + "# + colab={\"base_uri\": \"https://localhost:8080/\"} id=\"OZe3RNWK_G46\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622373835555, \"user_tz\": -180, \"elapsed\": 470, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}} outputId=\"ee476e5c-256f-499b-b05c-77c2ef207e55\"\n", + "input_tensor = Input(shape = 64 , )\n", + "\n", + "x = layers.Dense(32 , activation= \"relu\" )(input_tensor)\n", + "\n", + "x = layers.Dense(32 , activation= \"relu\")(x)\n", + "\n", + "output_tensor = layers.Dense(10 , activation= \"softmax\")(x)\n", + "\n", + "model = Model(input_tensor , output_tensor)\n", + "\n", + "model.summary()\n", + "\n", + "# + [markdown] id=\"BTQ2DOBxBKoQ\"\n", + "# ## Modelin Derlenmesi\n", + "\n", + "# + id=\"rv1L8Mp9BB2-\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622373916990, \"user_tz\": -180, \"elapsed\": 330, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": 
\"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}}\n", + "model.compile(optimizer= \"rmsprop\" , loss= \"categorical_crossentropy\")\n", + "\n", + "# + [markdown] id=\"4sZM36wDBWb8\"\n", + "# Eğitim için rasgele bir küme oluşturmak \n", + "\n", + "# + id=\"bVV6voVGBV3O\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622373956783, \"user_tz\": -180, \"elapsed\": 501, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}}\n", + "import numpy as np\n", + "\n", + "# + id=\"7ZE3utGTBfj0\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622374075582, \"user_tz\": -180, \"elapsed\": 529, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}}\n", + "x_train = np.random.random((1000 , 64))\n", + "y_train = np.random.random((1000 , 10))\n", + "\n", + "# + [markdown] id=\"f2zgeRM4BwXA\"\n", + "# ## Modelin Eğitilmesi\n", + "\n", + "# + colab={\"base_uri\": \"https://localhost:8080/\"} id=\"yZ8Xk1x-Bsqo\" executionInfo={\"status\": \"ok\", \"timestamp\": 1622374165363, \"user_tz\": -180, \"elapsed\": 19384, \"user\": {\"displayName\": \"\\u0130smail \\u00d6zdere\", \"photoUrl\": \"https://lh3.googleusercontent.com/a-/AOh14GhGk3QzQtOA4pNSRRaCMWCcX4YisHfLwlFlv47xDQ=s64\", \"userId\": \"01635209815510460280\"}} outputId=\"e81f41fc-40ba-4dce-a4db-0720b918c1a7\"\n", + "model.fit(x_train , y_train , epochs= 10 , batch_size= 128)\n", + "\n", + "score = model.evaluate(x_train , y_train)\n", + "\n", + "# + id=\"z6-C2N0tCN0X\"\n", + "\n", + "plot(ax=ax)\n", + "plt.show()\n", + "\n", + "# Oops! Where did the bike lanes go? Well, python uses a default color for all plots, so the bike paths were plotted on top of the polygon in the exact same color. Let's try to plot the bike lanes yellow.\n", + "\n", + "fig, ax = plt.subplots(figsize = (10,8)) \n", + "berkeley.plot(ax=ax)\n", + "bikes.plot(ax=ax, color=\"yellow\")\n", + "plt.show()\n", + "\n", + "# Now we have a map that shows where the bike network of the City of Berkeley is located.\n", + "\n", + "# \n", + "# ## 10.4 Read in data via a Python library (OSMnx)\n", + "#\n", + "# OSMnx is a Python library that lets you access Open Street Map's street networks through an API.\n", + "#\n", + "# You can explore more of Open Street Maps [here](https://www.openstreetmap.org/)\n", + "#\n", + "# You can access the full documentation of OSMnx [here](https://osmnx.readthedocs.io/en/stable/index.html)\n", + "\n", + "# +\n", + "# Uncomment to install library\n", + "# # !pip install osmnx\n", + "# -\n", + "\n", + "# If the below cell does not run, you need to install the library first, by uncommmenting and running the cell above\n", + "#\n", + "# > **Note**\n", + "# >\n", + "# > If you get a `numpy` associated error you may need to uninstall and reinstall `numpy` as well as set up tools. Run the following lines of code in your terminal:\n", + "# >\n", + "# pip uninstall -y numpy\n", + "# pip uninstall -y setuptools\n", + "# pip install setuptools\n", + "# pip install numpy\n", + "\n", + "import osmnx as ox\n", + "\n", + "# Now we can use the osmnx library to access data from Open Street Maps. Let's try to load the Berkeley street map. \n", + "# We are using the graph_from_place function. 
To see the full documentation for the function, go to this link: https://osmnx.readthedocs.io/en/stable/osmnx.html#osmnx.graph.graph_from_place.\n", + "#\n", + "#\n", + "# We need to define two arguments for the function: the **query** and the **network type**\n", + "#\n", + "# - **Query**: For cities in the US, the query should follow the following format: \"City Name, State Abbreviation, USA\"\n", + "# \n", + "# \n", + "# - **Network Type**: This is where we define which network we are interested in. Some of the available options are:\n", + "# - all\n", + "# - drive\n", + "# - walk\n", + "# - bike\n", + "#\n", + "\n", + "# Let's try to read the data for the vehicular network for Berkeley\n", + "\n", + "place = \"Berkeley, CA, USA\"\n", + "graph = ox.graph_from_place(place, network_type='drive')\n", + "\n", + "# This took a while to read. Let's take a look at how many elements were loaded from OSM for Berkeley\n", + "\n", + "len(graph)\n", + "\n", + "# Let's check the data type\n", + "\n", + "type(graph)\n", + "\n", + "# This is a new format. To get this into something that is familiar to us, we are going to extract the nodes and links by using the *graph_to_gdfs* function, which converts our data from a graph to two geodataframes. Because a street network is made up from nodes and links, and our geodatraframes can only have one geography type, the *graph_to_gdfs* returns 2 geodataframes: a node (point) and a street (line) geodataframe.\n", + "\n", + "nodes, streets = ox.graph_to_gdfs(graph)\n", + "streets.plot();\n", + "\n", + "# Now, let's try to put everything together in the same map (the limits of the city, the bike lanes and the streets)\n", + "\n", + "fig, ax = plt.subplots(figsize = (10,8)) \n", + "berkeley.plot(ax=ax)\n", + "streets.plot(ax=ax, color=\"grey\")\n", + "bikes.plot(ax=ax, color=\"yellow\")\n", + "plt.show()\n", + "\n", + "# Another feature that we can extract form OSMnx is the bus stops. To do this, we use the pois_from_place function (see full documentation [here](https://osmnx.readthedocs.io/en/stable/osmnx.html#osmnx.pois.pois_from_place))\n", + "#\n", + "# This function requires two arguments: the **query** (same as above) and the **tag**:\n", + "#\n", + "# - **Query**: For cities in the US, the query should follow the following format: \"City Name, State Abbreviation, USA\"\n", + "# \n", + "# \n", + "# - **Tag**: This is where we define which tags we are interested in. There are many options available. You can find a list of tag features [here](https://wiki.openstreetmap.org/wiki/Map_Features#Highway). These tags are coded as dictionaries. Bus stops are a value defined under the key highway, therefore, the format to call for bus stops looks like this: {'highway':'bus_stop'}\n", + "\n", + "# Let's access the bus stops using the same query defined for Berkeley\n", + "#\n", + "# > **Note**\n", + "# >\n", + "# >If you are using an older version of `osmnx` you would be able to use the function `pois_from_place`. This and other functions such as `footprints_from_place` are deprecated as of July 2020. `geometries_from_place` is meant to replace these functions.\n", + "\n", + "### fetch and map POIs from osmnx\n", + "busstops = ox.geometries_from_place(place, tags = {'highway':'bus_stop'})\n", + "\n", + "# Now, let's check the data type busstops was read as\n", + "\n", + "type(busstops)\n", + "\n", + "# As we can see, busstops is already a geodataframe. 
Therefore, we can plot it as it is unto out map.\n", + "\n", + "fig, ax = plt.subplots(figsize = (10,8)) \n", + "berkeley.plot(ax=ax)\n", + "streets.plot(ax=ax, color=\"grey\")\n", + "bikes.plot(ax=ax, color=\"yellow\")\n", + "busstops.plot(ax=ax, color=\"white\")\n", + "plt.show()\n", + "\n", + "# \n", + "# ## 10.5 Exercise\n", + "#\n", + "# Repeat above for SF. The link for accessing the bikeways for SF is already given to you below.\n", + "#\n", + "# ### SF Open Data portal\n", + "#\n", + "# https://datasf.org/opendata/\n", + "#\n", + "# #### SF Bike Network data\n", + "# https://data.sfgov.org/Transportation/SFMTA-Bikeway-Network/ygmz-vaxd\n", + "\n", + "sf_bike_ways = \"https://data.sfgov.org/api/geospatial/ygmz-vaxd?method=export&format=GeoJSON\"\n", + "\n", + "# +\n", + "# Your code here\n", + "# -\n", + "\n", + "# ## Double-click here to see solution!\n", + "#\n", + "# \n", + "\n", + "# \n", + "# ## 10.6 Read in Data from a CSV and convert to geodataframe\n", + "#\n", + "# In this example, we'll learn how to read a csv file with latitude and longitude coordinates and convert it to a geodataframe for plotting.\n", + "\n", + "# Read in CSV file\n", + "stations = pd.read_csv(\"notebook_data/transportation/bart.csv\")\n", + "stations.head()\n", + "\n", + "# We now want to convert the csv file into a Point geodataframe, so we can produce maps and access the geospatial analysis tools.\n", + "#\n", + "# We do this below with the geopandas `GeoDataFrame` function which takes as input\n", + "#\n", + "# 1. a pandas dataframe here `stations`, and\n", + "# 2. `geometry` for each row in the dataframe.\n", + "#\n", + "# We create the geometry using the geopandas `points_from_xy` function, using the data in the `lon` and `lat` columns of the pandas dataframe.\n", + "\n", + "# +\n", + "#Convert the DataFrame to a GeoDataFrame. \n", + "bart_gdf = gpd.GeoDataFrame(stations, geometry=gpd.points_from_xy(stations.lon, stations.lat)) \n", + "\n", + "# and take a look\n", + "bart_gdf.plot();\n", + "# -\n", + "\n", + "# Now we have a map of BART stations! You can use this approach with any CSV file that has columns of x,y coordinates.\n", + "\n", + "# ### 10.7 Exercises\n", + "#\n", + "#\n", + "#\n", + "# Set the CRS for `bart_gdf` to WGS84\n", + "\n", + "\n", + "\n", + "# Below is the url for the 2018 census county geographic boundary file.\n", + "#\n", + "# * Read in the county file\n", + "# * Subset on Marin County\n", + "# * Plot Marin County with the Bart stations you transformed\n", + "# * Question: what should do if the county name is not unique?\n", + "\n", + "# Census Counties file for the USA\n", + "county_file = \"https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_500k.zip\"\n", + "\n", + "# +\n", + "# Your code here\n", + "# -\n", + "\n", + "# ## Double-click here to see solution!\n", + "#\n", + "# \n", + "\n", + "# ---\n", + "#
\n", + "# \n", + "# \n", + "#
\n", + "#\n", + "#
\n", + "#
 D-Lab @ University of California - Berkeley
\n", + "#
 Team Geo
\n", + "#
\n", + "# \n", + "#\n", + "\n" + ] + } + ], + "source": [ + "template = json.loads(data_w_templates[1][\"templates\"])\n", + "print(template)\n", + "print(data_w_templates[1][\"script\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'template': '# YOUR CODE HERE', 'index': 6250}]\n", + "# ---\n", + "# jupyter:\n", + "# jupytext:\n", + "# text_representation:\n", + "# extension: .py\n", + "# format_name: light\n", + "# format_version: '1.5'\n", + "# jupytext_version: 1.15.2\n", + "# kernelspec:\n", + "# display_name: Python 3\n", + "# language: python\n", + "# name: python3\n", + "# ---\n", + "\n", + "# # Plotting and Functions\n", + "\n", + "# This notebook will work trough how to plot data and how to define functions. Throughout the lecture we will take a few moments to plot different functions and see how they depend on their parameters\n", + "\n", + "# ## Plotting in Python: Matplot \n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import scipy as sp\n", + "\n", + "# Pyplot is a powerful plotting library that can be used to make publication quaility plots. It is also useful for quikly plotting the results of a calcualtion. \n", + "#\n", + "# This is a quick demonstration of its use\n", + "#\n", + "# Note: when you call al library `import matplotlib.pyplot as plt` the way that use it is to do the following `plt.function()` where `function()` is whatever you are trying to call from the library\n", + "\n", + "# Define x and y values for some function\n", + "x = [i for i in range(20)]\n", + "y1 = [i**2 for i in x]\n", + "y2 = [i**3 for i in x]\n", + "\n", + "# The methods used above to make the lists is considered very *pythonic*. It works the same as a loop, but outputs all the results into a list. The left-hand most argument is what the list elements will be and the right hand side is the the way the loop will work.\n", + "\n", + "# When you use pyplot to make a plot, you can add more than one data set to the figure until you render the plot. Once you render the plot it resets\n", + "\n", + "plt.plot(x,y1)\n", + "plt.plot(x,y2)\n", + "plt.xlabel('X', fontsize=24)\n", + "plt.ylabel('Y', fontsize=24)\n", + "plt.legend(['Quadratic', 'Cubic'], loc=0)\n", + "plt.show()\n", + "\n", + "# We can call also use numpy fucntions to make our plots. Numpy is a very powerful math library\n", + "\n", + "# linspace will make a list of values from initial to final with however many increments you want\n", + "# this example goes from 0-2.5 with 20 increments\n", + "x=numpy.linspace(0,1.0,20)\n", + "print(x)\n", + "\n", + "exp_func=np.exp(-2*np.pi*x)\n", + "print(exp_func)\n", + "\n", + "plt.plot(x,exp_func, color=\"black\")\n", + "plt.xlabel('x', fontsize=24)\n", + "plt.ylabel(\"y(x)\", fontsize=24)\n", + "plt.show()\n", + "\n", + "# All aspects of the plot can be changed. 
The best way to figure out what you want to do is to go to the Matplotlib gallery and choose an image that looks like what you are trying to do.\n", + "#\n", + "# https://matplotlib.org/gallery/index.html\n", + "\n", + "# ### Example: Scatter plot with histograms\n", + "\n", + "# +\n", + "import numpy as np\n", + "\n", + "#Fixing random state for reproducibility\n", + "np.random.seed(19680801)\n", + "\n", + "# the random data\n", + "x = np.random.randn(1000)\n", + "y = np.random.randn(1000)\n", + "\n", + "# definitions for the axes\n", + "left, width = 0.1, 0.65\n", + "bottom, height = 0.1, 0.65\n", + "spacing = 0.005\n", + "\n", + "\n", + "rect_scatter = [left, bottom, width, height]\n", + "rect_histx = [left, bottom + height + spacing, width, 0.2]\n", + "rect_histy = [left + width + spacing, bottom, 0.2, height]\n", + "\n", + "# start with a rectangular Figure\n", + "plt.figure(figsize=(8, 8))\n", + "\n", + "ax_scatter = plt.axes(rect_scatter)\n", + "ax_scatter.tick_params(direction='in', top=True, right=True)\n", + "ax_histx = plt.axes(rect_histx)\n", + "ax_histx.tick_params(direction='in', labelbottom=False)\n", + "ax_histy = plt.axes(rect_histy)\n", + "ax_histy.tick_params(direction='in', labelleft=False)\n", + "\n", + "# the scatter plot:\n", + "ax_scatter.scatter(x, y)\n", + "\n", + "# now determine nice limits by hand:\n", + "binwidth = 0.25\n", + "lim = np.ceil(np.abs([x, y]).max() / binwidth) * binwidth\n", + "ax_scatter.set_xlim((-lim, lim))\n", + "ax_scatter.set_ylim((-lim, lim))\n", + "\n", + "bins = np.arange(-lim, lim + binwidth, binwidth)\n", + "ax_histx.hist(x, bins=bins)\n", + "ax_histy.hist(y, bins=bins, orientation='horizontal')\n", + "\n", + "ax_histx.set_xlim(ax_scatter.get_xlim())\n", + "ax_histy.set_ylim(ax_scatter.get_ylim())\n", + "\n", + "plt.show()\n", + "# -\n", + "\n", + "# I don't have to be an expert in making that kind of plot. I just have to understand and guess enough to figure out. 
I also google things I don't know\n", + "#\n", + "# https://www.google.com/search?client=firefox-b-1-d&q=pyplot+histogram+change+color\n", + "#\n", + "# https://stackoverflow.com/questions/42172440/python-matplotlib-histogram-color?rq=1\n", + "#\n", + "# https://matplotlib.org/examples/color/named_colors.html\n", + "#\n", + "# Then I can make small changes to have the plot look how I want it to look\n", + "#\n", + "# Notice below I changed \n", + "#\n", + "# `ax_scatter.scatter(x, y, color=\"purple\")`, \n", + "#\n", + "# `ax_histx.hist(x, bins=bins, color = \"skyblue\")`, \n", + "#\n", + "# `ax_histy.hist(y, bins=bins, orientation='horizontal', color=\"salmon\")`\n", + "\n", + "# +\n", + "#Fixing random state for reproducibility\n", + "np.random.seed(19680801)\n", + "\n", + "# the random data\n", + "x = np.random.randn(1000)\n", + "y = np.random.randn(1000)\n", + "\n", + "# definitions for the axes\n", + "left, width = 0.1, 0.65\n", + "bottom, height = 0.1, 0.65\n", + "spacing = 0.005\n", + "\n", + "\n", + "rect_scatter = [left, bottom, width, height]\n", + "rect_histx = [left, bottom + height + spacing, width, 0.2]\n", + "rect_histy = [left + width + spacing, bottom, 0.2, height]\n", + "\n", + "# start with a rectangular Figure\n", + "plt.figure(figsize=(8, 8))\n", + "\n", + "ax_scatter = plt.axes(rect_scatter)\n", + "ax_scatter.tick_params(direction='in', top=True, right=True)\n", + "ax_histx = plt.axes(rect_histx)\n", + "ax_histx.tick_params(direction='in', labelbottom=False)\n", + "ax_histy = plt.axes(rect_histy)\n", + "ax_histy.tick_params(direction='in', labelleft=False)\n", + "\n", + "# the scatter plot:\n", + "ax_scatter.scatter(x, y, color=\"purple\")\n", + "\n", + "# now determine nice limits by hand:\n", + "binwidth = 0.25\n", + "lim = np.ceil(np.abs([x, y]).max() / binwidth) * binwidth\n", + "ax_scatter.set_xlim((-lim, lim))\n", + "ax_scatter.set_ylim((-lim, lim))\n", + "\n", + "bins = np.arange(-lim, lim + binwidth, binwidth)\n", + "ax_histx.hist(x, bins=bins, color = \"skyblue\")\n", + "ax_histy.hist(y, bins=bins, orientation='horizontal', color=\"salmon\")\n", + "\n", + "ax_histx.set_xlim(ax_scatter.get_xlim())\n", + "ax_histy.set_ylim(ax_scatter.get_ylim())\n", + "\n", + "\n", + "\n", + "plt.show()\n", + "# -\n", + "\n", + "# Notice how I changed the colors on the plot based off of what I found on the stack exchange. The way to solve issues in the course and computational work is to google them.\n", + "\n", + "# ## Plotting Exersice 1\n", + "\n", + "# Find a plot from the gallery that you like. 
Then make some sort of noticable change to it.\n", + "\n", + "# +\n", + "# orginal plot here\n", + "\n", + "# +\n", + "# your new plot here\n", + "# -\n", + "\n", + "# ## Plotting Exersice 2\n", + "\n", + "# Plot a the following functions on the same plot from $ -2\\pi $ to $2\\pi$\n", + "#\n", + "# $$ \\sin(2\\pi x+\\pi)$$\n", + "# $$ \\cos(2\\pi x+\\pi)$$\n", + "# $$\\sin(2\\pi x+\\pi)+\\cos(2\\pi x+\\pi)$$\n", + "\n", + "# This might be useful:\n", + "# https://docs.scipy.org/doc/numpy/reference/generated/numpy.sin.html\n", + "# https://docs.scipy.org/doc/numpy/reference/generated/numpy.cos.html#numpy.cos\n", + "\n", + "# +\n", + "# Your code here\n", + "# -\n", + "\n", + "# # Lecture plots\n", + "\n", + "# Periodically during lecture we will take a pause to plot some of the interesting functions that we use in class.\n", + "\n", + "# ## Classical wavefunctions\n", + "#\n", + "# The following plot shows the the spacial component of the standard wavefunction with a wavelength of $\\lambda=\\text{1.45 m}$ and a relative amplitude of $A=1$ when the time, $t=0$ and the phase $\\phi=1.0$.\n", + "\n", + "x=numpy.linspace(0,3.0,100)\n", + "sinx=np.sin(2*np.pi*x+0+1)\n", + "plt.plot(x,sinx, color=\"black\")\n", + "plt.xlabel('x', fontsize=24)\n", + "plt.ylabel(\"y(x)\", fontsize=24)\n", + "plt.show()\n", + "\n", + "# Make a new figure where you plot the same wave function at three time points in the future. Assume the frequency is $\\nu=.1 \\text{ ms / s} $ Use a different color for each plot\n", + "\n", + "# +\n", + "# Your code here\n", + "# -\n", + "\n", + "# ## Orthogonality\n", + "\n", + "# Graphically show that the the following two functions are orthogonal on the interval $-3\\pi$ to $3\\pi$\n", + "# $$ \\sin(x) \\text{ and } \\cos(3x)$$\n", + "#\n", + "# Plot both functions together, then plot the product of both functions and explain why it is orthogonal\n", + "\n", + "# +\n", + "# Your plots here\n", + "\n", + "# +\n", + "prod=sinx*cos3x\n", + "\n", + "\n", + "\n", + "# -\n", + "\n", + "# Use the numpy trapezoid rule integrator to show the the two functions are orthogonal\n", + "# `np.trapz(y,x)`\n", + "#\n", + "# https://docs.scipy.org/doc/numpy/reference/generated/numpy.trapz.html\n", + "\n", + "# Example code\n", + "x=numpy.linspace(0,1.0,20)\n", + "exp_func=np.exp(-2*np.pi*x)\n", + "np.trapz(exp_func,x)\n", + "\n", + "# +\n", + "# Your code here\n", + "# -\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "template = json.loads(data_w_templates[20][\"templates\"])\n", + "print(template)\n", + "print(data_w_templates[20][\"script\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'template': '# YOUR CODE HERE', 'index': 633}]\n", + "# ---\n", + "# jupyter:\n", + "# jupytext:\n", + "# text_representation:\n", + "# extension: .py\n", + "# format_name: light\n", + "# format_version: '1.5'\n", + "# jupytext_version: 1.15.2\n", + "# kernelspec:\n", + "# display_name: Python [conda env:python2]\n", + "# language: python\n", + "# name: conda-env-python2-py\n", + "# ---\n", + "\n", + "# +\n", + "import numpy as np\n", + "import random\n", + "import scipy.misc as sp\n", + "\n", + "from q1_softmax import softmax\n", + "from q2_gradcheck import gradcheck_naive\n", + "from q2_sigmoid import sigmoid, sigmoid_grad\n", + "\n", + "def normalizeRows(x):\n", + " \"\"\" Row normalization function \"\"\"\n", + " # Implement a function that normalizes each row of a matrix to have 
unit length\n", + " \n", + " ### YOUR CODE HERE\n", + " row_norm = np.sqrt(np.sum(x * x, axis=1, keepdims=True))\n", + " x = x / row_norm\n", + " ### END YOUR CODE\n", + " \n", + " return x\n", + "\n", + "def test_normalize_rows():\n", + " print \"Testing normalizeRows...\"\n", + " x = normalizeRows(np.array([[3.0,4.0],[1, 2]])) \n", + " # the result should be [[0.6, 0.8], [0.4472, 0.8944]]\n", + " print x\n", + " assert (x.all() == np.array([[0.6, 0.8], [0.4472, 0.8944]]).all())\n", + " print \"Ok normalizeRows !\"\n", + "\n", + "\n", + "\n", + "# -\n", + "\n", + "def gradcheck_naive(f, x):\n", + " \"\"\" \n", + " Gradient check for a function f \n", + " - f should be a function that takes a single argument and outputs the cost and its gradients\n", + " - x is the point (numpy array) to check the gradient at\n", + " \"\"\" \n", + "\n", + " rndstate = random.getstate()\n", + " random.setstate(rndstate) \n", + " fx, grad = f(x) # Evaluate function value at original point\n", + " h = 1e-4\n", + "\n", + " # Iterate over all indexes in x\n", + " it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])\n", + " while not it.finished:\n", + " ix = it.multi_index\n", + "\n", + " ### try modifying x[ix] with h defined above to compute numerical gradients\n", + " ### make sure you call random.setstate(rndstate) before calling f(x) each time, this will make it \n", + " ### possible to test cost functions with built in randomness later\n", + " ### YOUR CODE HERE:\n", + " epsilon = np.zeros(x.shape)\n", + " epsilon[ix] = h\n", + "\n", + " random.setstate(rndstate)\n", + " f1 = f(x + epsilon)\n", + "\n", + " random.setstate(rndstate)\n", + " f2 = f(x - epsilon)\n", + " \n", + " numgrad = (f1[0] - f2[0])/2/h\n", + " ### END YOUR CODE\n", + "\n", + " # Compare gradients\n", + " reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))\n", + " if reldiff > 1e-5:\n", + " print \"Gradient check failed.\"\n", + " print \"First gradient error found at index %s\" % str(ix)\n", + " print \"Your gradient: %f \\t Numerical gradient: %f\" % (grad[ix], numgrad)\n", + " return\n", + " \n", + " it.iternext() # Step to next dimension\n", + "\n", + " print \"Gradient check passed!\"\n", + "\n", + "\n", + "def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C, word2vecCostAndGradient = softmaxCostAndGradient):\n", + " batchsize = 50\n", + " cost = 0.0\n", + " grad = np.zeros(wordVectors.shape)\n", + " N = wordVectors.shape[0]\n", + " inputVectors = wordVectors[:N/2,:]\n", + " outputVectors = wordVectors[N/2:,:]\n", + " for i in xrange(batchsize):\n", + " C1 = random.randint(1,C)\n", + " centerword, context = dataset.getRandomContext(C1)\n", + " \n", + " if word2vecModel == skipgram:\n", + " denom = 1\n", + " else:\n", + " denom = 1\n", + " \n", + " c, gin, gout = word2vecModel(centerword, C1, context, tokens, inputVectors, outputVectors, dataset, word2vecCostAndGradient)\n", + " cost += c / batchsize / denom\n", + " grad[:N/2, :] += gin / batchsize / denom\n", + " grad[N/2:, :] += gout / batchsize / denom\n", + " \n", + " return cost, grad\n", + "\n", + "\n", + "\n", + "def test_word2vec():\n", + " # Interface to the dataset for negative sampling\n", + " dataset = type('dummy', (), {})()\n", + " def dummySampleTokenIdx():\n", + " return random.randint(0, 4)\n", + "\n", + " def getRandomContext(C):\n", + " tokens = [\"a\", \"b\", \"c\", \"d\", \"e\"]\n", + " return tokens[random.randint(0,4)], [tokens[random.randint(0,4)] \\\n", + " for i in xrange(2*C)]\n", + " 
dataset.sampleTokenIdx = dummySampleTokenIdx\n", + " dataset.getRandomContext = getRandomContext\n", + "\n", + " random.seed(31415)\n", + " np.random.seed(9265)\n", + " dummy_vectors = normalizeRows(np.random.randn(10,3))\n", + " dummy_tokens = dict([(\"a\",0), (\"b\",1), (\"c\",2),(\"d\",3),(\"e\",4)])\n", + " print \"==== Gradient check for skip-gram ====\"\n", + " gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5), dummy_vectors)\n", + " gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors)\n", + " print \"\\n==== Gradient check for CBOW ====\"\n", + " gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5), dummy_vectors)\n", + " gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors)\n", + "\n", + " print \"\\n=== Results ===\"\n", + " print skipgram(\"c\", 3, [\"a\", \"b\", \"e\", \"d\", \"b\", \"c\"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)\n", + " print skipgram(\"c\", 1, [\"a\", \"b\"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient)\n", + " print cbow(\"a\", 2, [\"a\", \"b\", \"c\", \"a\"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)\n", + " print cbow(\"a\", 2, [\"a\", \"b\", \"a\", \"c\"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient)\n", + "\n", + "\n", + "\n", + " # Interface to the dataset for negative sampling\n", + " dataset = type('dummy', (), {})()\n", + " def dummySampleTokenIdx():\n", + " return random.randint(0, 4)\n", + "\n", + " def getRandomContext(C):\n", + " tokens = [\"a\", \"b\", \"c\", \"d\", \"e\"]\n", + " return tokens[random.randint(0,4)], [tokens[random.randint(0,4)] \\\n", + " for i in xrange(2*C)]\n", + " dataset.sampleTokenIdx = dummySampleTokenIdx\n", + " dataset.getRandomContext = getRandomContext\n", + "\n", + " random.seed(31415)\n", + " np.random.seed(9265)\n", + " dummy_vectors = normalizeRows(np.random.randn(10,3))\n", + " dummy_tokens = dict([(\"a\",0), (\"b\",1), (\"c\",2),(\"d\",3),(\"e\",4)])\n", + "\n", + "\n", + "print \"hello: \" + str(dummy_tokens)\n", + "\n", + "\n", + "def softmaxCostAndGradient(predicted, target, outputVectors, dataset):\n", + " \"\"\" Softmax cost function for word2vec models \"\"\"\n", + " \n", + " # Implement the cost and gradients for one predicted word vector \n", + " # and one target word vector as a building block for word2vec \n", + " # models, assuming the softmax prediction function and cross \n", + " # entropy loss. \n", + " \n", + " # Inputs: \n", + " # - predicted: numpy ndarray, predicted word vector (\\hat{v} in \n", + " # the written component or \\hat{r} in an earlier version)\n", + " # - target: integer, the index of the target word \n", + " # - outputVectors: \"output\" vectors (as rows) for all tokens \n", + " # - dataset: needed for negative sampling, unused here. \n", + " \n", + " # Outputs: \n", + " # - cost: cross entropy cost for the softmax word prediction \n", + " # - gradPred: the gradient with respect to the predicted word \n", + " # vector \n", + " # - grad: the gradient with respect to all the other word \n", + " # vectors \n", + " \n", + " # We will not provide starter code for this function, but feel \n", + " # free to reference the code you previously wrote for this \n", + " # assignment! 
\n", + " \n", + " ### YOUR CODE HERE\n", + " \n", + " # Compute softmax matrix\n", + " prod = np.matmul(outputVectors, predicted)\n", + " softmax_matrix = softmax(prod)\n", + " #sumexp = np.sum(np.exp(prod)) => overflow\n", + "\n", + " # Compute cost\n", + " cost = - np.log(softmax_matrix)[target]\n", + "\n", + " \t# Compute gradients\n", + " negpart = np.matmul(softmax_matrix, outputVectors)\n", + "# print \"softmax_matrix: \" + str(softmax_matrix)\n", + "# print \"softmax_matrix shape: \" + str(softmax_matrix.shape)\n", + "# print \"outputVectors: \" + str(outputVectors)\n", + "# print \"outputVectors shape: \" + str(outputVectors.shape)\n", + "# print \"negpart: \" + str(negpart)\n", + " gradPred = - outputVectors[target] + negpart\n", + "\n", + " grad = np.outer(softmax_matrix, predicted)\n", + " grad[target] -= predicted\n", + "\n", + " ### END YOUR CODE\n", + " \n", + " return cost, gradPred, grad\n", + "\n", + "\n", + "def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, \n", + " K=10):\n", + " \"\"\" Negative sampling cost function for word2vec models \"\"\"\n", + "\n", + " # Implement the cost and gradients for one predicted word vector \n", + " # and one target word vector as a building block for word2vec \n", + " # models, using the negative sampling technique. K is the sample \n", + " # size. You might want to use dataset.sampleTokenIdx() to sample \n", + " # a random word index. \n", + " # \n", + " # Note: See test_word2vec below for dataset's initialization.\n", + " # \n", + " # Input/Output Specifications: same as softmaxCostAndGradient \n", + " # We will not provide starter code for this function, but feel \n", + " # free to reference the code you previously wrote for this \n", + " # assignment!\n", + " \n", + " ### YOUR CODE HERE\n", + "# print \"predicted: \" + str(predicted)\n", + "# print \"target: \" + str(target)\n", + "# print \"outputVectors\" + str(outputVectors)\n", + " # Sample K negative samples\n", + " neg_samples = np.zeros((K,predicted.size))\n", + " neg_sample_idx = []\n", + " for k in range(K):\n", + " idx = dataset.sampleTokenIdx()\n", + " while idx == target:\n", + " idx = dataset.sampleTokenIdx()\n", + " neg_sample_idx.append(idx)\n", + " \tneg_samples[k] = outputVectors[idx]\n", + "# print \"neg_sample_idx\" + str(neg_sample_idx)\n", + "# print \"neg_samples: \"+ str(neg_samples)\n", + "\n", + " prod = np.matmul(neg_samples, predicted)\n", + "# print \"prod: \" + str(prod)\n", + " # Compute cost\n", + " cost = - np.log(sigmoid(np.dot(outputVectors[target], predicted))) - np.sum(np.log(sigmoid(-prod)))\n", + "# print \"cost: \" + str(cost)\n", + "\n", + " \t# Compute gradients\n", + " sigm = sigmoid(-np.dot(outputVectors[target], predicted))\n", + " \n", + " negpart = np.zeros(predicted.shape)\n", + " for k in range(10):\n", + " negpart += sigmoid(prod)[k] * neg_samples[k]\n", + "# print \"negative samples:\" + str(neg_samples)\n", + "# print \"negative samples shape:\" + str(neg_samples.shape)\n", + "# print \"sigmoid(prod): \" + str(sigmoid(prod))\n", + "# print \"sigmoid(prod) shape: \" + str(sigmoid(prod).shape)\n", + "# print \"negmat: \" + str(negmat)\n", + "# print \"negmat shape: \" + str(negmat.shape)\n", + "#\tnegpart = np.sum(negmat, axis = 0)\n", + "# print \"negpart: \" + str(negpart)\n", + " gradPred = - sigm * outputVectors[target] + negpart\n", + "# print \"gradPred: \" + str(gradPred)\n", + "\n", + " grad = np.zeros(outputVectors.shape)\n", + " for i in range(K):\n", + " grad[neg_sample_idx[i]] += sigmoid(prod[i]) * 
predicted\n", + " grad[target] -= sigm * predicted\n", + "# print \"grad: \" + str(grad)\n", + "\n", + " ### END YOUR CODE\n", + " \n", + " return cost, gradPred, grad\n", + "\n", + "\n", + "def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors, \n", + " dataset, word2vecCostAndGradient = softmaxCostAndGradient):\n", + " \"\"\" Skip-gram model in word2vec \"\"\"\n", + "\n", + " # Implement the skip-gram model in this function.\n", + "\n", + " # Inputs: \n", + " # - currrentWord: a string of the current center word \n", + " # - C: integer, context size \n", + " # - contextWords: list of no more than 2*C strings, the context words \n", + " # - tokens: a dictionary that maps words to their indices in \n", + " # the word vector list \n", + " # - inputVectors: \"input\" word vectors (as rows) for all tokens \n", + " # - outputVectors: \"output\" word vectors (as rows) for all tokens \n", + " # - word2vecCostAndGradient: the cost and gradient function for \n", + " # a prediction vector given the target word vectors, \n", + " # could be one of the two cost functions you \n", + " # implemented above\n", + "\n", + " # Outputs: \n", + " # - cost: the cost function value for the skip-gram model \n", + " # - grad: the gradient with respect to the word vectors \n", + " # We will not provide starter code for this function, but feel \n", + " # free to reference the code you previously wrote for this \n", + " # assignment!\n", + "\n", + " ### YOUR CODE HERE\n", + " pred_index = tokens[currentWord]\n", + " cost = 0\n", + " gradIn = np.zeros(inputVectors.shape)\n", + " gradOut = np.zeros(outputVectors.shape)\n", + " for m in range(2*C):\n", + " \ttarget_index = tokens[contextWords[m]]\n", + " \tsingle_cost, gradPred, grad = word2vecCostAndGradient(inputVectors[pred_index], target_index, outputVectors, dataset)\n", + "# print \"cost: \" + str(single_cost)\n", + "# print \"gradPred: \" + str(gradPred)\n", + "# print \"grad: \" + str(grad)\n", + " \tcost += single_cost\n", + " \tgradIn[pred_index] += gradPred\n", + " \tgradOut += grad\n", + " ### END YOUR CODE\n", + " \n", + " return cost, gradIn, gradOut\n", + "\n", + "\n", + "\n", + "# +\n", + "#word2vec_sgd_wrapper(skipgram, dummy_tokens, dummy_vectors, dataset, 5)\n", + "#word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C, word2vecCostAndGradient = softmaxCostAndGradient)\n", + "\n", + "batchsize = 50\n", + "cost = 0.0\n", + "grad = np.zeros(dummy_vectors.shape)\n", + "N = dummy_vectors.shape[0]\n", + "inputVectors = dummy_vectors[:N/2,:]\n", + "outputVectors = dummy_vectors[N/2:,:]\n", + "C1 = random.randint(1,5)\n", + "centerword, context = dataset.getRandomContext(C1)\n", + "c, gin, gout = skipgram(centerword, C1, context, dummy_tokens, inputVectors, outputVectors, dataset, negSamplingCostAndGradient)\n", + "print \"c: \" +str(c)\n", + "print \"gin: \" + str(gin)\n", + "print\"gout: \"+str(gout)\n", + "# -\n", + "\n", + "predicted = np.array([-0.56713774, -0.27178229, -0.77748902])\n", + "neg_samples= np.array([[ 0.18289107, 0.76098587, -0.62245591],\n", + " [-0.6831809, -0.04200519, 0.72904007],\n", + " [-0.6831809, -0.04200519, 0.72904007],\n", + " [ 0.18289107, 0.76098587, -0.62245591],\n", + " [-0.61517874, 0.5147624, -0.59713884],\n", + " [-0.52629529, -0.78190408, 0.33412466],\n", + " [-0.61517874, 0.5147624, -0.59713884],\n", + " [ 0.18289107, 0.76098587, -0.62245591],\n", + " [ 0.18289107, 0.76098587, -0.62245591],\n", + " [-0.6831809, -0.04200519, 0.72904007]])\n", + "print \"-3: 
\"+str(predicted.shape)\n", + "print \"-2: \"+str(neg_samples.shape)\n", + "prod = np.matmul(neg_samples, predicted)\n", + "print \"-1: \"+str(prod.shape)\n", + "negmat = np.matmul(neg_samples.T,sigmoid(prod))\n", + "print \"0: \"+str(negmat.shape)\n", + "print \"1: \"+str(negmat[0])\n", + "vec_sum = np.zeros(predicted.shape)\n", + "for k in range(10):\n", + " vec_sum += sigmoid(prod)[k] * neg_samples[k]\n", + "print \"2: \"+str(vec_sum)\n", + "\n", + "test_word2vec()\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "template = json.loads(data_w_templates[200][\"templates\"])\n", + "print(template)\n", + "print(data_w_templates[200][\"script\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.9 ('eval-harness': conda)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "271972ab9158cd42175bc1ec5288153b91d150291a0b625c2babd1911356e891" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 6ae87c94b6dc72bdb59464f6554d0249dd29c769 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 18 Oct 2023 20:57:48 +0000 Subject: [PATCH 2/2] add noetbook --- data_analysis/notebook_analysis/scan_templates.ipynb | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/data_analysis/notebook_analysis/scan_templates.ipynb b/data_analysis/notebook_analysis/scan_templates.ipynb index 98fde8b..5b77441 100644 --- a/data_analysis/notebook_analysis/scan_templates.ipynb +++ b/data_analysis/notebook_analysis/scan_templates.ipynb @@ -28,8 +28,11 @@ "source": [ "import json\n", "import pandas as pd\n", + "from collections import Counter\n", "from datasets import load_dataset, Dataset\n", "\n", + "TEMPLATES = [\"# YOUR CODE HERE\", \"TODO: implement this function\", \"# TODO: Implement me\", \"# TODO: Write your implementation here\"]\n", + "\n", "ds = load_dataset(\"bigcode/jupyter_scripts_dedup\", split=\"train\", streaming=True)\n", "ds = list(ds.take(10_000))\n", "ds = Dataset.from_pandas(pd.DataFrame(data=ds))" @@ -49,10 +52,6 @@ } ], "source": [ - "import json\n", - "\n", - "TEMPLATES = [\"# YOUR CODE HERE\", \"TODO: implement this function\", \"# TODO: Implement me\", \"# TODO: Write your implementation here\"]\n", - "\n", "def detect_template(example):\n", " content = example[\"script\"]\n", " dict_templates = []\n", @@ -132,8 +131,6 @@ } ], "source": [ - "from collections import Counter\n", - "\n", "templates_found = [json.loads(x)[0][\"template\"] for x in data_w_templates[\"templates\"]]\n", "Counter(templates_found)" ]