From a61e837b26c970eed83356f1d592e52f50c24845 Mon Sep 17 00:00:00 2001 From: Joachim Bach Date: Thu, 2 Oct 2025 15:59:25 +0200 Subject: [PATCH] done ex1 --- PW-3/ex1/ex1-bayes-stud.ipynb | 293 +++++++++++++++++++++++++++++----- 1 file changed, 257 insertions(+), 36 deletions(-) diff --git a/PW-3/ex1/ex1-bayes-stud.ipynb b/PW-3/ex1/ex1-bayes-stud.ipynb index 4d6f60e..0b22b7b 100644 --- a/PW-3/ex1/ex1-bayes-stud.ipynb +++ b/PW-3/ex1/ex1-bayes-stud.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 468, "metadata": {}, "outputs": [], "source": [ @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 469, "metadata": { "pycharm": { "is_executing": false @@ -58,16 +58,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 470, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " x1 x2 y\n", + "0 34.623660 78.024693 0\n", + "1 30.286711 43.894998 0\n", + "2 35.847409 72.902198 0\n", + "3 60.182599 86.308552 1\n", + "4 79.032736 75.344376 1\n" + ] + } + ], "source": [ "X_train, y_train = read_data(\"ex1-data-train.csv\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 471, "metadata": {}, "outputs": [], "source": [ @@ -85,15 +98,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 472, "metadata": { "pycharm": { "is_executing": false } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "P(C0) = 0.4, P(C1) = 0.6\n" + ] + } + ], "source": [ "# TODO: Compute the priors\n", + "\n", + "unique, counts = np.unique(y_train, return_counts=True)\n", + "\n", + "P_c0 = counts[0] / (counts[0] + counts[1])\n", + "P_c1 = counts[1] / (counts[0] + counts[1])\n", + "print(f\"P(C0) = {P_c0}, P(C1) = {P_c1}\")\n", "\n" ] }, @@ -106,27 +133,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
473, "metadata": { "pycharm": { "is_executing": false } }, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# TODO: Compute histograms\n", "\n", + "X_train = np.array(X_train)\n", "\n", "\n", + "x1_combined = np.column_stack((X_train[:,0], y_train))\n", + "\n", + "x1_c0 = x1_combined[y_train == 0]\n", + "x1_c1 = x1_combined[y_train == 1]\n", + "\n", + "x1_c0_hist, x1_c0_bins = np.histogram(x1_c0[:,0], bins=\"auto\")\n", + "x1_c1_hist, x1_c1_bins = np.histogram(x1_c1[:,0], bins=\"auto\")\n", + "\n", + "\n", + "x2_combined = np.column_stack((X_train[:,1], y_train))\n", + "\n", + "x2_c0 = x2_combined[y_train == 0]\n", + "x2_c1 = x2_combined[y_train == 1]\n", + "\n", + "x2_c0_hist, x2_c0_bins = np.histogram(x2_c0[:,0], bins=\"auto\")\n", + "x2_c1_hist, x2_c1_bins = np.histogram(x2_c1[:,0], bins=\"auto\")\n", + "\n", "# TODO: plot histograms\n", "\n", "plt.figure(figsize=(16,6))\n", "\n", "plt.subplot(1, 2, 1)\n", + "plt.hist(x1_c0[:,0], bins=\"auto\", alpha=0.6, label=\"failed\")\n", + "plt.hist(x1_c1[:,0], bins=\"auto\", alpha=0.6, label=\"passed\")\n", + "plt.legend()\n", "...\n", "plt.xlabel('Likelihood hist - Exam 1')\n", "\n", "plt.subplot(1, 2, 2)\n", + "plt.hist(x2_c0[:,0], bins=\"auto\", alpha=0.6, label=\"failed\")\n", + "plt.hist(x2_c1[:,0], bins=\"auto\", alpha=0.6, label=\"passed\")\n", + "plt.legend()\n", "...\n", "plt.xlabel('Likelihood hist - Exam 2')\n", "\n", @@ -142,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 474, "metadata": { "pycharm": { "is_executing": false @@ -153,7 +215,38 @@ "def likelihood_hist(x: float, hist_values: np.ndarray, bin_edges: np.ndarray) -> float:\n", " # TODO: compute likelihoods from histograms outputs\n", "\n", - " return ..." 
+ " bin_index = np.digitize(x, bin_edges) - 1\n", + "\n", + " if bin_index >= len(hist_values) or bin_index < 0:\n", + " return 0\n", + "\n", + " # print(f\"edges = {bin_edges}\")\n", + " # print(f\"values = {hist_values}\")\n", + " # print(f\"selected bin = {bin_index}\")\n", + "\n", + " count = hist_values[bin_index]\n", + " return count/np.sum(hist_values)" + ] + }, + { + "cell_type": "code", + "execution_count": 475, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(0.4)" + ] + }, + "execution_count": 475, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hist, edges = np.histogram(x2_y0[:,0], bins=\"auto\")\n", + "likelihood_hist(45, hist, edges)" ] }, { @@ -168,34 +261,101 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 476, "metadata": { "pycharm": { "is_executing": false } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " x1 x2 y\n", + "0 39.196334 78.530294 0\n", + "1 40.448499 86.839470 1\n", + "2 65.571920 44.303497 0\n", + "3 79.648113 70.806564 1\n", + "4 66.260221 41.672703 0\n" + ] + } + ], "source": [ "X_test, y_test = read_data(\"ex1-data-test.csv\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 477, "metadata": { "pycharm": { "is_executing": false } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pred accuracy with x1 0.64\n", + "Pred accuracy with x2 0.72\n", + "Pred accuracy with x1 and x2 0.87\n" + ] + } + ], "source": [ "# TODO: predict on test set in the 3 cases described above\n", "\n", - "y_pred = []\n", + "y_pred_x1 = []\n", "\n", - "...\n", + "X_test = np.array(X_test)\n", "\n", - "accuracy_score(y_test, y_pred)" + "for i in range(len(X_test)):\n", + " P_c0_x1 = likelihood_hist(X_test[i,0], x1_c0_hist, x1_c0_bins)*P_c0\n", + " P_c1_x1 = likelihood_hist(X_test[i,0], x1_c1_hist, x1_c1_bins)*P_c1\n", + "\n", + " if 
P_c0_x1 > P_c1_x1:\n", + " y_pred_x1.append(0)\n", + " else:\n", + " y_pred_x1.append(1)\n", + "\n", + "print(f\"Pred accuracy with x1 {accuracy_score(y_test, y_pred_x1)}\")\n", + "\n", + "\n", + "\n", + "y_pred_x2 = []\n", + "\n", + "for i in range(len(X_test)):\n", + " P_c0_x2 = likelihood_hist(X_test[i,1], x2_c0_hist, x2_c0_bins)*P_c0\n", + " P_c1_x2 = likelihood_hist(X_test[i,1], x2_c1_hist, x2_c1_bins)*P_c1\n", + "\n", + " if P_c0_x2 > P_c1_x2:\n", + " y_pred_x2.append(0)\n", + " else:\n", + " y_pred_x2.append(1)\n", + "\n", + "print(f\"Pred accuracy with x2 {accuracy_score(y_test, y_pred_x2)}\")\n", + "\n", + "\n", + "\n", + "\n", + "y_pred_x1_x2 = []\n", + "\n", + "for i in range(len(X_test)):\n", + "# for i in range(0,1):\n", + " P_c0_x1 = likelihood_hist(X_test[i,0], x1_c0_hist, x1_c0_bins)*P_c0\n", + " P_c1_x1 = likelihood_hist(X_test[i,0], x1_c1_hist, x1_c1_bins)*P_c1\n", + " P_c0_x2 = likelihood_hist(X_test[i,1], x2_c0_hist, x2_c0_bins)*P_c0\n", + " P_c1_x2 = likelihood_hist(X_test[i,1], x2_c1_hist, x2_c1_bins)*P_c1\n", + "\n", + " if P_c0_x2*P_c0_x1 > P_c1_x1*P_c1_x2:\n", + " y_pred_x1_x2.append(0)\n", + " else:\n", + " y_pred_x1_x2.append(1)\n", + "\n", + "print(f\"Pred accuracy with x1 and x2 {accuracy_score(y_test, y_pred_x1_x2)}\")\n", + "\n" ] }, { @@ -205,13 +365,6 @@ "outputs": [], "source": [] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -223,7 +376,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "TODO: answer" + "The system with both variables is clearly the better one and this makes sense, because the chance of success of a student is clearly related to the performance of both exams, not just one. The system with only one variable can make good decisions only when the student has failed or succeeded in that exam by a wide margin, in which case the second variable becomes less important. 
For example, if one has 0 on the first exam, it is really not probable that he will pass, even if we don't know the second grade.\n", + "\n", + "All in all, more variables in this case mean a better approximation." ] }, { @@ -242,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 478, "metadata": { "pycharm": { "is_executing": false @@ -253,12 +408,15 @@ "def likelihood_univariate_gaussian(x: float, mean: float, var: float) -> float:\n", " # TODO: compute likelihoods from histograms outputs\n", "\n", - " return ..." + " sqrt = np.sqrt(2*np.pi*var)\n", + " exp = np.exp(-(1/(2*var)) * (x - mean)**2)\n", + "\n", + " return (1/sqrt) * exp" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 479, "metadata": { "pycharm": { "is_executing": false @@ -267,22 +425,85 @@ "outputs": [], "source": [ "# TODO: Compute mean and variance for each classes and each features (8 values)\n", + "\n", + "mean_x1_c0 = np.mean(X_train[:,0][y_train == 0])\n", + "mean_x1_c1 = np.mean(X_train[:,0][y_train == 1])\n", + "\n", + "mean_x2_c0 = np.mean(X_train[:,1][y_train == 0])\n", + "mean_x2_c1 = np.mean(X_train[:,1][y_train == 1])\n", + "\n", + "var_x1_c0 = np.var(X_train[:,0][y_train == 0])\n", + "var_x1_c1 = np.var(X_train[:,0][y_train == 1])\n", + "\n", + "var_x2_c0 = np.var(X_train[:,1][y_train == 0])\n", + "var_x2_c1 = np.var(X_train[:,1][y_train == 1])\n", + "\n", + "\n", "\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 480, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pred accuracy with x1 0.71\n", + "Pred accuracy with x2 0.72\n", + "Pred accuracy with x1 and x2 0.92\n" + ] + } + ], "source": [ "# TODO: predict on test set in the 3 cases\n", "\n", - "y_pred = []\n", + "y_pred_x1 = []\n", "\n", - "...\n", + "for i in range(len(X_test)):\n", + " P_c0_x1 = likelihood_univariate_gaussian(X_test[i,0], mean_x1_c0, var_x1_c0)*P_c0\n", 
+ " P_c1_x1 = likelihood_univariate_gaussian(X_test[i,0], mean_x1_c1, var_x1_c1)*P_c1\n", "\n", - "accuracy_score(y_test, y_pred)" + " if P_c0_x1 > P_c1_x1:\n", + " y_pred_x1.append(0)\n", + " else:\n", + " y_pred_x1.append(1)\n", + "\n", + "print(f\"Pred accuracy with x1 {accuracy_score(y_test, y_pred_x1)}\")\n", + "\n", + "y_pred_x2 = []\n", + "\n", + "for i in range(len(X_test)):\n", + " P_c0_x2 = likelihood_univariate_gaussian(X_test[i,1], mean_x2_c0, var_x2_c0)*P_c0\n", + " P_c1_x2 = likelihood_univariate_gaussian(X_test[i,1], mean_x2_c1, var_x2_c1)*P_c1\n", + "\n", + " if P_c0_x2 > P_c1_x2:\n", + " y_pred_x2.append(0)\n", + " else:\n", + " y_pred_x2.append(1)\n", + "\n", + "print(f\"Pred accuracy with x2 {accuracy_score(y_test, y_pred_x2)}\")\n", + "\n", + "\n", + "\n", + "\n", + "y_pred_x1_x2 = []\n", + "\n", + "for i in range(len(X_test)):\n", + "# for i in range(0,1):\n", + " P_c0_x1 = likelihood_univariate_gaussian(X_test[i,0], mean_x1_c0, var_x1_c0)*P_c0\n", + " P_c1_x1 = likelihood_univariate_gaussian(X_test[i,0], mean_x1_c1, var_x1_c1)*P_c1\n", + " P_c0_x2 = likelihood_univariate_gaussian(X_test[i,1], mean_x2_c0, var_x2_c0)*P_c0\n", + " P_c1_x2 = likelihood_univariate_gaussian(X_test[i,1], mean_x2_c1, var_x2_c1)*P_c1\n", + "\n", + " if P_c0_x2*P_c0_x1 > P_c1_x1*P_c1_x2:\n", + " y_pred_x1_x2.append(0)\n", + " else:\n", + " y_pred_x1_x2.append(1)\n", + "\n", + "print(f\"Pred accuracy with x1 and x2 {accuracy_score(y_test, y_pred_x1_x2)}\")\n" ] }, { @@ -302,7 +523,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -316,7 +537,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.7" + "version": "3.12.3" }, "pycharm": { "stem_cell": {