259 lines
9.3 KiB
Plaintext
259 lines
9.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b94b0451",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
"\n",
|
"# Load the life-satisfaction table from the local CSV and split it into\n",
"# a single-feature matrix X and a single-column target y\n",
"lifesat = pd.read_csv(\"lifesat.csv\")\n",
"X = lifesat[[\"GDP per capita (USD)\"]].values\n",
"y = lifesat[[\"Life satisfaction\"]].values\n",
"\n",
"# Scatter plot of the raw data, zoomed to the same window as before\n",
"ax = lifesat.plot.scatter(x=\"GDP per capita (USD)\", y=\"Life satisfaction\",\n",
"                          grid=True)\n",
"ax.axis([23_500, 62_500, 4, 9])\n",
"plt.show()\n",
"\n",
"# Select a linear model and train it (fit returns the estimator itself)\n",
"model = LinearRegression().fit(X, y)\n",
"\n",
"# Make a prediction for Cyprus\n",
"X_new = [[37_655.2]]  # Cyprus' GDP per capita in 2020\n",
"print(model.predict(X_new))  # outputs [[6.30165767]]\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "94fda07f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# 200 evenly spaced GDP values, each wrapped as a one-feature sample row\n",
"X_test = [[gdp] for gdp in np.linspace(25000, 60000, 200)]\n",
"\n",
"# Linear-model predictions over the whole grid\n",
"y_test = model.predict(X_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "838b0242",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Scatter the raw data and overlay the linear-regression line in red\n",
"ax = lifesat.plot.scatter(x=\"GDP per capita (USD)\", y=\"Life satisfaction\",\n",
"                          grid=True)\n",
"ax.axis([23_500, 62_500, 4, 9])\n",
"ax.plot(X_test, y_test, color='red')\n",
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "aa14a4ca",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"class KNearestNeighborRegressor(object):\n",
"    \"\"\" a kNN regressor with L2 distance \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def train(self, X, y):\n",
"        \"\"\"\n",
"        Train the regressor. For k-nearest neighbors this is just\n",
"        memorizing the training data.\n",
"\n",
"        Inputs:\n",
"        - X: A numpy array of shape (num_train, D) containing the training data\n",
"          consisting of num_train samples each of dimension D.\n",
"        - y: A numpy array of shape (num_train,) or (num_train, 1) containing\n",
"          the training targets, where y[i] is the target value for X[i].\n",
"        \"\"\"\n",
"        self.X_train = X\n",
"        self.y_train = y\n",
"\n",
"    def predict(self, X, k=1):\n",
"        \"\"\"\n",
"        Predict target values for test data using this regressor.\n",
"\n",
"        Inputs:\n",
"        - X: A numpy array of shape (num_test, D) containing test data consisting\n",
"          of num_test samples each of dimension D.\n",
"        - k: The number of nearest neighbors whose targets are averaged.\n",
"\n",
"        Returns:\n",
"        - y: A numpy array of shape (num_test,) containing predicted values for\n",
"          the test data, where y[i] is the predicted value for the test point X[i].\n",
"\n",
"        Raises:\n",
"        - ValueError: if k is smaller than 1.\n",
"        \"\"\"\n",
"        dists = self.compute_distances(X)\n",
"        return self.predict_values(dists, k=k)\n",
"\n",
"    def compute_distances(self, X):\n",
"        \"\"\"\n",
"        Compute the distance between each test point in X and each training point\n",
"        in self.X_train using a single loop over the test data.\n",
"\n",
"        Inputs:\n",
"        - X: A numpy array of shape (num_test, D) containing test data.\n",
"\n",
"        Returns:\n",
"        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
"          is the Euclidean distance between the ith test point and the jth training\n",
"          point.\n",
"        \"\"\"\n",
"        num_test = X.shape[0]\n",
"        num_train = self.X_train.shape[0]\n",
"        dists = np.zeros((num_test, num_train))\n",
"        for i in range(num_test):\n",
"            # Broadcasting subtracts the ith test point from every training row;\n",
"            # summing the squared differences over the feature axis and taking\n",
"            # the square root gives one L2 distance per training point.\n",
"            diff = self.X_train - X[i]\n",
"            dists[i, :] = np.sqrt(np.sum(diff ** 2, axis=1))\n",
"        return dists\n",
"\n",
"    def predict_values(self, dists, k=1):\n",
"        \"\"\"\n",
"        Given a matrix of distances between test points and training points,\n",
"        predict a value for each test point.\n",
"\n",
"        Inputs:\n",
"        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
"          gives the distance between the ith test point and the jth training point.\n",
"        - k: The number of nearest neighbors to average. If k exceeds num_train,\n",
"          all training targets are averaged.\n",
"\n",
"        Returns:\n",
"        - y: A numpy array of shape (num_test,) containing predicted values for the\n",
"          test data, where y[i] is the predicted value for the test point X[i].\n",
"\n",
"        Raises:\n",
"        - ValueError: if k is smaller than 1 (the mean of zero neighbors is\n",
"          undefined).\n",
"        \"\"\"\n",
"        if k < 1:\n",
"            raise ValueError('k must be a positive integer')\n",
"\n",
"        # Accept targets stored as a list, a 1-D array or an (num_train, 1) column.\n",
"        targets = np.asarray(self.y_train)\n",
"        num_test = dists.shape[0]\n",
"        y_pred = np.zeros(num_test)\n",
"        for i in range(num_test):\n",
"            # Indices of the k smallest distances; a stable sort keeps the\n",
"            # training order among equidistant neighbors, so ties are resolved\n",
"            # deterministically.\n",
"            nearest = np.argsort(dists[i], kind='stable')[:k]\n",
"\n",
"            # Target values of the k nearest neighbors of the ith test point.\n",
"            closest_y = targets[nearest]\n",
"\n",
"            # The regression estimate is the plain average of those targets.\n",
"            y_pred[i] = np.mean(closest_y)\n",
"\n",
"        return y_pred"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "267d1168",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# train() only stores the data it is given; it is handed a fresh array copy of X\n",
"knn_reg = KNearestNeighborRegressor()\n",
"knn_reg.train(X=np.array(X), y=y)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fd8203ba",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# One prediction curve over the evaluation grid per neighborhood size k\n",
"y_hat_1, y_hat_3, y_hat_5, y_hat_7, y_hat_20, y_hat_27 = (\n",
"    knn_reg.predict(np.array(X_test), k=k) for k in (1, 3, 5, 7, 20, 27)\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d3704256",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Scatter the raw data, then overlay the linear fit (red) and the k=1 kNN\n",
"# prediction (green)\n",
"ax = lifesat.plot.scatter(x=\"GDP per capita (USD)\", y=\"Life satisfaction\",\n",
"                          grid=True)\n",
"ax.axis([23_500, 62_500, 4, 9])\n",
"ax.plot(X_test, y_test, color='red')\n",
"ax.plot(X_test, y_hat_1, color='green')\n",
"# Optional overlays for larger k; uncomment to compare:\n",
"# ax.plot(X_test, y_hat_3, color='blue')\n",
"# ax.plot(X_test, y_hat_5, color='magenta')\n",
"# ax.plot(X_test, y_hat_7, color='orange')\n",
"# ax.plot(X_test, y_hat_20, color='black')\n",
"# ax.plot(X_test, y_hat_27, color='grey')\n",
"plt.show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.13"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|