Started PW-2

This commit is contained in:
gabriel.marinoja
2025-09-23 13:18:25 +02:00
commit f0e1453d13
9 changed files with 13088 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
Country,GDP per capita (USD),Life satisfaction
Russia,26456.3879381321,5.8
Greece,27287.0834009302,5.4
Turkey,28384.9877846263,5.5
Latvia,29932.4939100562,5.9
Hungary,31007.7684065437,5.6
Portugal,32181.1545372343,5.4
Poland,32238.157259275,6.1
Estonia,35638.4213511812,5.7
Spain,36215.4475907307,6.3
Slovenia,36547.7389559849,5.9
Lithuania,36732.034744031,5.9
Israel,38341.3075704083,7.2
Italy,38992.1483807498,6.0
United Kingdom,41627.129269425,6.8
France,42025.6173730617,6.5
New Zealand,42404.3937381567,7.3
Canada,45856.6256264804,7.4
Finland,47260.800458441,7.6
Belgium,48210.0331113444,6.9
Australia,48697.8370282475,7.3
Sweden,50683.3235097178,7.3
Germany,50922.3580234484,7.0
Austria,51935.6038618156,7.1
Iceland,52279.7288513646,7.5
Netherlands,54209.5638357302,7.4
Denmark,55938.2128086032,7.6
United States,60235.7284916969,6.9
1 Country GDP per capita (USD) Life satisfaction
2 Russia 26456.3879381321 5.8
3 Greece 27287.0834009302 5.4
4 Turkey 28384.9877846263 5.5
5 Latvia 29932.4939100562 5.9
6 Hungary 31007.7684065437 5.6
7 Portugal 32181.1545372343 5.4
8 Poland 32238.157259275 6.1
9 Estonia 35638.4213511812 5.7
10 Spain 36215.4475907307 6.3
11 Slovenia 36547.7389559849 5.9
12 Lithuania 36732.034744031 5.9
13 Israel 38341.3075704083 7.2
14 Italy 38992.1483807498 6.0
15 United Kingdom 41627.129269425 6.8
16 France 42025.6173730617 6.5
17 New Zealand 42404.3937381567 7.3
18 Canada 45856.6256264804 7.4
19 Finland 47260.800458441 7.6
20 Belgium 48210.0331113444 6.9
21 Australia 48697.8370282475 7.3
22 Sweden 50683.3235097178 7.3
23 Germany 50922.3580234484 7.0
24 Austria 51935.6038618156 7.1
25 Iceland 52279.7288513646 7.5
26 Netherlands 54209.5638357302 7.4
27 Denmark 55938.2128086032 7.6
28 United States 60235.7284916969 6.9

View File

@@ -0,0 +1,258 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b94b0451",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"# Load the data set from the local CSV file and split it into the\n",
"# single-feature matrix X and the target column y (both 2-D arrays)\n",
"gdp_col = \"GDP per capita (USD)\"\n",
"satisfaction_col = \"Life satisfaction\"\n",
"lifesat = pd.read_csv(\"lifesat.csv\")\n",
"X = lifesat[[gdp_col]].values\n",
"y = lifesat[[satisfaction_col]].values\n",
"\n",
"# Scatter plot of the raw observations on fixed axis limits\n",
"lifesat.plot.scatter(x=gdp_col, y=satisfaction_col, grid=True)\n",
"plt.axis([23_500, 62_500, 4, 9])\n",
"plt.show()\n",
"\n",
"# Create a linear model and fit it to the observations in one step\n",
"model = LinearRegression().fit(X, y)\n",
"\n",
"# Predict the life satisfaction for Cyprus from its GDP per capita\n",
"X_new = [[37_655.2]]  # Cyprus' GDP per capita in 2020\n",
"print(model.predict(X_new))  # outputs [[6.30165767]]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94fda07f",
"metadata": {},
"outputs": [],
"source": [
"# 200 evenly spaced GDP values, each wrapped as a single-feature row,\n",
"# and the linear model's prediction for every one of them\n",
"X_test = [[gdp] for gdp in np.linspace(25000, 60000, 200)]\n",
"y_test = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "838b0242",
"metadata": {},
"outputs": [],
"source": [
"# Scatter plot of the observations with the fitted regression line (red)\n",
"lifesat.plot.scatter(x=\"GDP per capita (USD)\", y=\"Life satisfaction\",\n",
"                     grid=True)\n",
"plt.axis([23_500, 62_500, 4, 9])\n",
"plt.plot(X_test, y_test, color='red')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa14a4ca",
"metadata": {},
"outputs": [],
"source": [
"class KNearestNeighborRegressor(object):\n",
"    \"\"\"A k-nearest-neighbour regressor using the L2 (Euclidean) distance.\"\"\"\n",
"\n",
"    def __init__(self):\n",
"        # Nothing is memorised until train() is called.\n",
"        self.X_train = None\n",
"        self.y_train = None\n",
"\n",
"    def train(self, X, y):\n",
"        \"\"\"\n",
"        Train the regressor. For k-nearest neighbors this is just\n",
"        memorizing the training data.\n",
"\n",
"        Inputs:\n",
"        - X: Array-like of shape (num_train, D) containing the training data\n",
"          consisting of num_train samples each of dimension D.\n",
"        - y: Array-like of shape (num_train,) or (num_train, 1) containing the\n",
"          training targets, where y[i] is the target value for X[i].\n",
"\n",
"        Raises:\n",
"        - ValueError: if X is not 2-D, holds no samples, or y does not hold\n",
"          exactly one target per row of X.\n",
"        \"\"\"\n",
"        X = np.asarray(X, dtype=float)\n",
"        # Flatten so that a (num_train, 1) column vector is treated exactly\n",
"        # like a flat (num_train,) vector of targets.\n",
"        y = np.asarray(y, dtype=float).reshape(-1)\n",
"        if X.ndim != 2:\n",
"            raise ValueError(\"X must be a 2-D array of shape (num_train, D)\")\n",
"        if X.shape[0] == 0:\n",
"            raise ValueError(\"X must contain at least one training sample\")\n",
"        if y.shape[0] != X.shape[0]:\n",
"            raise ValueError(\"X and y must contain the same number of samples\")\n",
"        self.X_train = X\n",
"        self.y_train = y\n",
"\n",
"    def predict(self, X, k=1):\n",
"        \"\"\"\n",
"        Predict target values for test data using this regressor.\n",
"\n",
"        Inputs:\n",
"        - X: A numpy array of shape (num_test, D) containing test data consisting\n",
"          of num_test samples each of dimension D.\n",
"        - k: The number of nearest neighbors whose targets are averaged. Values\n",
"          larger than the training set size use every training point.\n",
"\n",
"        Returns:\n",
"        - y: A numpy array of shape (num_test,) containing predicted values for\n",
"          the test data, where y[i] is the prediction for the test point X[i].\n",
"\n",
"        Raises:\n",
"        - RuntimeError: if train() has not been called.\n",
"        - ValueError: if k is smaller than 1.\n",
"        \"\"\"\n",
"        dists = self.compute_distances(X)\n",
"        return self.predict_values(dists, k=k)\n",
"\n",
"    def compute_distances(self, X):\n",
"        \"\"\"\n",
"        Compute the distance between each test point in X and each training point\n",
"        in self.X_train using a single loop over the test data.\n",
"\n",
"        Inputs:\n",
"        - X: A numpy array of shape (num_test, D) containing test data.\n",
"\n",
"        Returns:\n",
"        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
"          is the Euclidean distance between the ith test point and the jth\n",
"          training point.\n",
"\n",
"        Raises:\n",
"        - RuntimeError: if train() has not been called.\n",
"        \"\"\"\n",
"        self._require_training_data()\n",
"        X = np.asarray(X, dtype=float)\n",
"        num_test = X.shape[0]\n",
"        num_train = self.X_train.shape[0]\n",
"        dists = np.zeros((num_test, num_train))\n",
"        for i in range(num_test):\n",
"            # Broadcasting subtracts the ith test point from every training\n",
"            # row; the norm along axis 1 yields one distance per training point.\n",
"            dists[i, :] = np.linalg.norm(self.X_train - X[i], axis=1)\n",
"        return dists\n",
"\n",
"    def predict_values(self, dists, k=1):\n",
"        \"\"\"\n",
"        Given a matrix of distances between test points and training points,\n",
"        predict a value for each test point.\n",
"\n",
"        Inputs:\n",
"        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
"          gives the distance between the ith test point and the jth training\n",
"          point.\n",
"        - k: The number of nearest neighbors whose targets are averaged.\n",
"\n",
"        Returns:\n",
"        - y: A numpy array of shape (num_test,) containing predicted values for\n",
"          the test data, where y[i] is the mean target of the k training points\n",
"          closest to the ith test point.\n",
"\n",
"        Raises:\n",
"        - RuntimeError: if train() has not been called.\n",
"        - ValueError: if k is smaller than 1.\n",
"        \"\"\"\n",
"        self._require_training_data()\n",
"        if k < 1:\n",
"            raise ValueError(\"k must be a positive integer\")\n",
"        dists = np.asarray(dists)\n",
"        num_test = dists.shape[0]\n",
"        y_pred = np.zeros(num_test)\n",
"        for i in range(num_test):\n",
"            # Indices of the k smallest distances; a stable sort keeps ties in\n",
"            # training-set order so that results are deterministic.\n",
"            nearest = np.argsort(dists[i], kind=\"stable\")[:k]\n",
"            # Targets of the k nearest neighbors of the ith test point.\n",
"            closest_y = self.y_train[nearest]\n",
"            y_pred[i] = np.mean(closest_y)\n",
"        return y_pred\n",
"\n",
"    def _require_training_data(self):\n",
"        \"\"\"Raise a RuntimeError if train() has not been called yet.\"\"\"\n",
"        if self.X_train is None or self.y_train is None:\n",
"            raise RuntimeError(\"train() must be called before predicting\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "267d1168",
"metadata": {},
"outputs": [],
"source": [
"# Memorise the training observations in a fresh kNN regressor\n",
"knn_reg = KNearestNeighborRegressor()\n",
"knn_reg.train(X=np.array(X), y=y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd8203ba",
"metadata": {},
"outputs": [],
"source": [
"# Query grid as a 2-D array, converted once instead of once per call\n",
"X_query = np.array(X_test)\n",
"\n",
"# One kNN prediction curve per neighbourhood size k; the y_hat_<k> names\n",
"# are kept because the plotting cell refers to them\n",
"y_hat_1, y_hat_3, y_hat_5, y_hat_7, y_hat_20, y_hat_27 = [\n",
"    knn_reg.predict(X_query, k=k) for k in (1, 3, 5, 7, 20, 27)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3704256",
"metadata": {},
"outputs": [],
"source": [
"# Compare the linear fit (red) with the kNN prediction for k=1 (green)\n",
"lifesat.plot.scatter(x=\"GDP per capita (USD)\", y=\"Life satisfaction\",\n",
"                     grid=True)\n",
"plt.axis([23_500, 62_500, 4, 9])\n",
"for curve, colour in ((y_test, 'red'), (y_hat_1, 'green')):\n",
"    plt.plot(X_test, curve, color=colour)\n",
"# Uncomment any of the following lines to overlay the curve for a larger k\n",
"# plt.plot(X_test, y_hat_3, color='blue')\n",
"# plt.plot(X_test, y_hat_5, color='magenta')\n",
"# plt.plot(X_test, y_hat_7, color='orange')\n",
"# plt.plot(X_test, y_hat_20, color='black')\n",
"# plt.plot(X_test, y_hat_27, color='grey')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}