{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b94b0451",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "# Load and prepare the data (lifesat.csv is read from the working directory)\n",
    "lifesat = pd.read_csv(\"lifesat.csv\")\n",
    "X = lifesat[[\"GDP per capita (USD)\"]].values\n",
    "y = lifesat[[\"Life satisfaction\"]].values\n",
    "\n",
    "# Visualize the data\n",
    "lifesat.plot(kind='scatter', grid=True,\n",
    "             x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
    "plt.axis([23_500, 62_500, 4, 9])\n",
    "plt.show()\n",
    "\n",
    "# Select a linear model\n",
    "model = LinearRegression()\n",
    "\n",
    "# Train the model\n",
    "model.fit(X, y)\n",
    "\n",
    "# Make a prediction for Cyprus\n",
    "X_new = [[37_655.2]]  # Cyprus' GDP per capita in 2020\n",
    "print(model.predict(X_new))  # outputs [[6.30165767]]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94fda07f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evenly spaced GDP values as a (200, 1) column, the 2-D layout that both\n",
    "# the linear model and the kNN regressor expect for their inputs.\n",
    "X_test = np.linspace(25000, 60000, 200).reshape(-1, 1)\n",
    "# Linear-regression predictions on that grid (plotted as the red line below).\n",
    "y_test = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "838b0242",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the data together with the fitted regression line\n",
    "lifesat.plot(kind='scatter', grid=True,\n",
    "             x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
    "plt.axis([23_500, 62_500, 4, 9])\n",
    "plt.plot(X_test, y_test, color='red')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa14a4ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "class KNearestNeighborRegressor(object):\n",
    "    \"\"\" a kNN regressor with L2 distance \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        pass\n",
    "\n",
    "    def train(self, X, y):\n",
    "        \"\"\"\n",
    "        Train the regressor. For k-nearest neighbors this is just\n",
    "        memorizing the training data.\n",
    "\n",
    "        Inputs:\n",
    "        - X: A numpy array of shape (num_train, D) containing the training data\n",
    "          consisting of num_train samples each of dimension D.\n",
    "        - y: A numpy array of shape (num_train,) or (num_train, 1) containing\n",
    "          the training targets, where y[i] is the target value for X[i].\n",
    "\n",
    "        Raises:\n",
    "        - ValueError: if y does not hold exactly one value per row of X.\n",
    "        \"\"\"\n",
    "        X = np.asarray(X, dtype=float)\n",
    "        y = np.asarray(y, dtype=float)\n",
    "        # A single-column target such as DataFrame[[col]].values is flattened\n",
    "        # so that exactly one scalar target is stored per training sample.\n",
    "        if y.ndim == 2 and y.shape[1] == 1:\n",
    "            y = y.ravel()\n",
    "        if y.ndim != 1 or y.shape[0] != X.shape[0]:\n",
    "            raise ValueError(\"y must hold exactly one target value per row of X\")\n",
    "        self.X_train = X\n",
    "        self.y_train = y\n",
    "\n",
    "    def predict(self, X, k=1):\n",
    "        \"\"\"\n",
    "        Predict target values for test data using this regressor.\n",
    "\n",
    "        Inputs:\n",
    "        - X: A numpy array of shape (num_test, D) containing test data consisting\n",
    "          of num_test samples each of dimension D.\n",
    "        - k: The number of nearest neighbors whose target values are averaged.\n",
    "          Must be at least 1; a k larger than num_train uses every training point.\n",
    "\n",
    "        Returns:\n",
    "        - y: A numpy array of shape (num_test,) containing predicted values for\n",
    "          the test data, where y[i] is the predicted value for the test point X[i].\n",
    "        \"\"\"\n",
    "        dists = self.compute_distances(np.asarray(X, dtype=float))\n",
    "\n",
    "        return self.predict_values(dists, k=k)\n",
    "\n",
    "    def compute_distances(self, X):\n",
    "        \"\"\"\n",
    "        Compute the distance between each test point in X and each training point\n",
    "        in self.X_train using a single loop over the test data.\n",
    "\n",
    "        Inputs:\n",
    "        - X: A numpy array of shape (num_test, D) containing test data.\n",
    "\n",
    "        Returns:\n",
    "        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
    "          is the Euclidean distance between the ith test point and the jth training\n",
    "          point.\n",
    "        \"\"\"\n",
    "        num_test = X.shape[0]\n",
    "        num_train = self.X_train.shape[0]\n",
    "        dists = np.zeros((num_test, num_train))\n",
    "        for i in range(num_test):\n",
    "            # Broadcasting subtracts the ith test point from every training row;\n",
    "            # the norm along axis 1 then yields one L2 distance per training point.\n",
    "            dists[i, :] = np.linalg.norm(self.X_train - X[i], axis=1)\n",
    "        return dists\n",
    "\n",
    "    def predict_values(self, dists, k=1):\n",
    "        \"\"\"\n",
    "        Given a matrix of distances between test points and training points,\n",
    "        predict a value for each test point.\n",
    "\n",
    "        Inputs:\n",
    "        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
    "          gives the distance between the ith test point and the jth training point.\n",
    "        - k: The number of nearest neighbors whose target values are averaged.\n",
    "\n",
    "        Returns:\n",
    "        - y: A numpy array of shape (num_test,) containing predicted values for the\n",
    "          test data, where y[i] is the predicted value for the test point X[i].\n",
    "\n",
    "        Raises:\n",
    "        - ValueError: if k is smaller than 1.\n",
    "        \"\"\"\n",
    "        if k < 1:\n",
    "            raise ValueError(\"k must be a positive integer\")\n",
    "        num_test = dists.shape[0]\n",
    "        y_pred = np.zeros(num_test)\n",
    "        for i in range(num_test):\n",
    "            # Indices of the k closest training points. The stable sort makes\n",
    "            # ties deterministic: the lower training index is taken first.\n",
    "            nearest = np.argsort(dists[i], kind=\"stable\")[:k]\n",
    "            # Target values of those neighbors.\n",
    "            closest_y = self.y_train[nearest]\n",
    "            # The regression estimate is the plain average of the neighbor targets.\n",
    "            y_pred[i] = np.mean(closest_y)\n",
    "\n",
    "        return y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "267d1168",
   "metadata": {},
   "outputs": [],
   "source": [
    "knn_reg = KNearestNeighborRegressor()\n",
    "knn_reg.train(np.array(X), y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd8203ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_hat_1 = knn_reg.predict(np.array(X_test), k=1)\n",
    "y_hat_3 = knn_reg.predict(np.array(X_test), k=3)\n",
    "y_hat_5 = knn_reg.predict(np.array(X_test), k=5)\n",
    "y_hat_7 = knn_reg.predict(np.array(X_test), k=7)\n",
    "y_hat_20 = knn_reg.predict(np.array(X_test), k=20)\n",
    "y_hat_27 = knn_reg.predict(np.array(X_test), k=27)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3704256",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize the data with the linear fit (red) and the kNN fit (green).\n",
    "# Uncomment further lines to compare other values of k.\n",
    "lifesat.plot(kind='scatter', grid=True,\n",
    "             x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
    "plt.axis([23_500, 62_500, 4, 9])\n",
    "plt.plot(X_test, y_test, color='red')\n",
    "plt.plot(X_test, y_hat_1, color='green')\n",
    "# plt.plot(X_test, y_hat_3, color='blue')\n",
    "# plt.plot(X_test, y_hat_5, color='magenta')\n",
    "# plt.plot(X_test, y_hat_7, color='orange')\n",
    "# plt.plot(X_test, y_hat_20, color='black')\n",
    "# plt.plot(X_test, y_hat_27, color='grey')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}