Compare commits
11 Commits
492686df00
...
14cf888160
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
14cf888160 | ||
|
|
d23d2a51c4 | ||
|
|
8ec28596c0 | ||
|
|
c360807ab5 | ||
|
|
770d6678ce | ||
|
|
ec48bb1546 | ||
|
|
aeb97473ea | ||
|
|
a61e837b26 | ||
|
|
a71160f863 | ||
|
|
b72020d0f7 | ||
|
|
a31eea177c |
324
PW-2/ex5-regression-knn/regression-knn-stud_Charline.ipynb
Normal file
324
PW-2/ex5-regression-knn/regression-knn-stud_Charline.ipynb
Normal file
File diff suppressed because one or more lines are too long
554
PW-3/ex1/ex1-bayes-stud.ipynb
Normal file
554
PW-3/ex1/ex1-bayes-stud.ipynb
Normal file
File diff suppressed because one or more lines are too long
100
PW-3/ex1/ex1-data-test.csv
Normal file
100
PW-3/ex1/ex1-data-test.csv
Normal file
@@ -0,0 +1,100 @@
|
||||
39.1963341568658,78.53029405902203,0
|
||||
40.448499233673424,86.83946993295656,1
|
||||
65.57192032694599,44.303496565835594,0
|
||||
79.64811329486565,70.8065641864705,1
|
||||
66.26022052135889,41.67270317074954,0
|
||||
97.6637443782087,68.3249232452966,1
|
||||
30.548823788843436,57.31847952965393,0
|
||||
89.47322095778219,85.94680780258534,1
|
||||
50.93087801180052,34.2357678392285,0
|
||||
39.79292275937423,83.42467462939659,1
|
||||
47.45440952767612,43.40242137611206,0
|
||||
69.97497171303611,84.4084067760751,1
|
||||
66.57906119077748,42.13570922437346,0
|
||||
85.05872976046471,54.31025004023918,1
|
||||
66.50445545099684,46.515380367647104,0
|
||||
75.67274744410004,93.79012528285647,1
|
||||
30.589637766842877,71.58841488039977,0
|
||||
43.2174833244174,83.55961536494472,1
|
||||
58.04023606927604,39.47235992846592,0
|
||||
40.15801957067056,94.28873609786281,1
|
||||
65.40785754453304,39.872039582416946,0
|
||||
58.25386824923051,64.96454852577446,1
|
||||
90.05150698066501,34.03096751205591,0
|
||||
72.24873848000416,90.1077757094509,1
|
||||
32.732305095404456,98.49269418173134,0
|
||||
74.06410532697512,66.96252809184301,1
|
||||
30.074888412046263,56.513104954256875,0
|
||||
87.57197590933474,68.15013081653733,1
|
||||
54.562040422189284,49.542441977062865,0
|
||||
78.30902280632358,72.23271250670665,1
|
||||
57.870305028845,48.514216465966285,0
|
||||
91.35751201085463,85.6201641726489,1
|
||||
32.89942225933118,68.89835152862396,0
|
||||
75.96271751468554,73.37079167632794,1
|
||||
49.73784613458287,59.13494209712587,0
|
||||
73.5544567377702,66.04140381033584,1
|
||||
34.20510941997501,72.62513617755425,0
|
||||
54.49230689236608,75.50968920375037,1
|
||||
48.50711697988822,47.74600670205531,0
|
||||
92.3876668476141,76.82950398511272,1
|
||||
39.89720264828788,62.09872615693186,0
|
||||
75.76883065897587,43.6375457580161,1
|
||||
32.938859931422954,75.6959591164835,0
|
||||
44.53335294213268,86.44202248365731,1
|
||||
51.265631719309845,60.12130845234037,0
|
||||
70.78776945843022,84.2462083261098,1
|
||||
28.94644639193278,39.599160546805116,0
|
||||
47.53708530844937,73.62887169594207,1
|
||||
49.02408652102979,48.50397486087145,0
|
||||
78.37067490088779,93.91476948225585,1
|
||||
48.806979396137145,62.206605350437144,0
|
||||
72.03919354554785,88.5636216577281,1
|
||||
31.23633606784064,96.30534895479137,0
|
||||
51.56156298671939,89.15548481990747,1
|
||||
65.08996501958059,39.488228986986606,0
|
||||
81.75983894249494,47.952028645978714,1
|
||||
46.466982795222684,43.17493123886225,0
|
||||
64.49601863360589,82.20819682836424,1
|
||||
65.59947425235588,42.79658543523777,0
|
||||
50.66778894002708,64.22662181783375,1
|
||||
30.665280235026138,42.70685221873931,0
|
||||
76.60228200416394,65.62163965042933,1
|
||||
60.39824874786827,38.54265995207925,0
|
||||
80.7498890348191,47.942468664004934,1
|
||||
81.83730756343084,39.62946723071423,0
|
||||
76.67188156208798,73.0039571691345,1
|
||||
31.702591304883626,73.4485451232566,0
|
||||
89.75853252236888,65.1794033434368,1
|
||||
31.111272744640324,77.90680809560692,0
|
||||
56.360076920020845,68.81541270666031,1
|
||||
47.365528695867354,59.268265092300844,0
|
||||
81.99701278469126,55.477765254828924,1
|
||||
73.19627144242138,28.399910031060564,0
|
||||
50.28593379220375,85.68597173591368,1
|
||||
30.532888808836397,77.17395841411421,0
|
||||
66.62736064332904,65.14099834530835,1
|
||||
30.563843972698294,44.15958836055778,0
|
||||
69.30483520344725,90.15732087213348,1
|
||||
40.63104177166124,61.47155968946135,0
|
||||
67.51887729702649,76.70896125160789,1
|
||||
33.6944962783859,43.961979616998335,0
|
||||
54.61941030575024,73.60040410454849,1
|
||||
29.956247697479498,91.60028497230863,0
|
||||
59.56176709683286,81.89054923262506,1
|
||||
29.097516205452173,92.0159604576793,0
|
||||
87.75444054660184,65.2841177353011,1
|
||||
79.14696413604753,40.118482227299694,0
|
||||
74.48492746059782,92.34246943037195,1
|
||||
26.332352061636747,44.9551699040027,0
|
||||
54.346942016509146,58.43293962287077,1
|
||||
29.947060203169244,93.06082834209418,0
|
||||
96.32633710641187,64.80350360838675,1
|
||||
29.864465690194475,73.11550264372423,0
|
||||
62.2263271267271,57.84956855286749,1
|
||||
35.2611254453108,72.85531587549292,0
|
||||
47.340681257438895,69.41232032562911,1
|
||||
63.19534209968015,36.963350930620166,0
|
||||
59.46464897992196,72.40245846384263,1
|
||||
60.08389682243888,42.48638233127113,0
|
||||
57.45295498601704,73.67928309399463,1
|
||||
|
100
PW-3/ex1/ex1-data-train.csv
Normal file
100
PW-3/ex1/ex1-data-train.csv
Normal file
@@ -0,0 +1,100 @@
|
||||
34.62365962451697,78.0246928153624,0
|
||||
30.28671076822607,43.89499752400101,0
|
||||
35.84740876993872,72.90219802708364,0
|
||||
60.18259938620976,86.30855209546826,1
|
||||
79.0327360507101,75.3443764369103,1
|
||||
45.08327747668339,56.3163717815305,0
|
||||
61.10666453684766,96.51142588489624,1
|
||||
75.02474556738889,46.55401354116538,1
|
||||
76.09878670226257,87.42056971926803,1
|
||||
84.43281996120035,43.53339331072109,1
|
||||
95.86155507093572,38.22527805795094,0
|
||||
75.01365838958247,30.60326323428011,0
|
||||
82.30705337399482,76.48196330235604,1
|
||||
69.36458875970939,97.71869196188608,1
|
||||
39.53833914367223,76.03681085115882,0
|
||||
53.9710521485623,89.20735013750205,1
|
||||
69.07014406283025,52.74046973016765,1
|
||||
67.94685547711617,46.67857410673128,0
|
||||
70.66150955499435,92.92713789364831,1
|
||||
76.97878372747498,47.57596364975532,1
|
||||
67.37202754570876,42.83843832029179,0
|
||||
89.67677575072079,65.79936592745237,1
|
||||
50.534788289883,48.85581152764205,0
|
||||
34.21206097786789,44.20952859866288,0
|
||||
77.9240914545704,68.9723599933059,1
|
||||
62.27101367004632,69.95445795447587,1
|
||||
80.1901807509566,44.82162893218353,1
|
||||
93.114388797442,38.80067033713209,0
|
||||
61.83020602312595,50.25610789244621,0
|
||||
38.78580379679423,64.99568095539578,0
|
||||
61.379289447425,72.80788731317097,1
|
||||
85.40451939411645,57.05198397627122,1
|
||||
52.10797973193984,63.12762376881715,0
|
||||
52.04540476831827,69.43286012045222,1
|
||||
40.23689373545111,71.16774802184875,0
|
||||
54.63510555424817,52.21388588061123,0
|
||||
33.91550010906887,98.86943574220611,0
|
||||
64.17698887494485,80.90806058670817,1
|
||||
74.78925295941542,41.57341522824434,0
|
||||
34.1836400264419,75.2377203360134,0
|
||||
83.90239366249155,56.30804621605327,1
|
||||
51.54772026906181,46.85629026349976,0
|
||||
94.44336776917852,65.56892160559052,1
|
||||
82.36875375713919,40.61825515970618,0
|
||||
51.04775177128865,45.82270145776001,0
|
||||
62.22267576120188,52.06099194836679,0
|
||||
77.19303492601364,70.45820000180959,1
|
||||
97.77159928000232,86.7278223300282,1
|
||||
62.07306379667647,96.76882412413983,1
|
||||
91.56497449807442,88.69629254546599,1
|
||||
79.94481794066932,74.16311935043758,1
|
||||
99.2725269292572,60.99903099844988,1
|
||||
90.54671411399852,43.39060180650027,1
|
||||
34.52451385320009,60.39634245837173,0
|
||||
50.2864961189907,49.80453881323059,0
|
||||
49.58667721632031,59.80895099453265,0
|
||||
97.64563396007767,68.86157272420604,1
|
||||
32.57720016809309,95.59854761387875,0
|
||||
74.24869136721598,69.82457122657193,1
|
||||
71.79646205863379,78.45356224515052,1
|
||||
75.3956114656803,85.75993667331619,1
|
||||
35.28611281526193,47.02051394723416,0
|
||||
56.25381749711624,39.26147251058019,0
|
||||
30.05882244669796,49.59297386723685,0
|
||||
44.66826172480893,66.45008614558913,0
|
||||
66.56089447242954,41.09209807936973,0
|
||||
40.45755098375164,97.53518548909936,1
|
||||
49.07256321908844,51.88321182073966,0
|
||||
80.27957401466998,92.11606081344084,1
|
||||
66.74671856944039,60.99139402740988,1
|
||||
32.72283304060323,43.30717306430063,0
|
||||
64.0393204150601,78.03168802018232,1
|
||||
72.34649422579923,96.22759296761404,1
|
||||
60.45788573918959,73.09499809758037,1
|
||||
58.84095621726802,75.85844831279042,1
|
||||
99.82785779692128,72.36925193383885,1
|
||||
47.26426910848174,88.47586499559782,1
|
||||
50.45815980285988,75.80985952982456,1
|
||||
60.45555629271532,42.50840943572217,0
|
||||
82.22666157785568,42.71987853716458,0
|
||||
88.9138964166533,69.80378889835472,1
|
||||
94.83450672430196,45.69430680250754,1
|
||||
67.31925746917527,66.58935317747915,1
|
||||
57.23870631569862,59.51428198012956,1
|
||||
80.36675600171273,90.96014789746954,1
|
||||
68.46852178591112,85.59430710452014,1
|
||||
42.0754545384731,78.84478600148043,0
|
||||
75.47770200533905,90.42453899753964,1
|
||||
78.63542434898018,96.64742716885644,1
|
||||
52.34800398794107,60.76950525602592,0
|
||||
94.09433112516793,77.15910509073893,1
|
||||
90.44855097096364,87.50879176484702,1
|
||||
55.48216114069585,35.57070347228866,0
|
||||
74.49269241843041,84.84513684930135,1
|
||||
89.84580670720979,45.35828361091658,1
|
||||
83.48916274498238,48.38028579728175,1
|
||||
42.2617008099817,87.10385094025457,1
|
||||
99.31500880510394,68.77540947206617,1
|
||||
55.34001756003703,64.9319380069486,1
|
||||
74.77589300092767,89.52981289513276,1
|
||||
|
745
PW-3/ex2/ex2-sys-eval-stud.ipynb
Normal file
745
PW-3/ex2/ex2-sys-eval-stud.ipynb
Normal file
@@ -0,0 +1,745 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bcf79585",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Exercice 2 - System evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f642cedb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "9421a4e1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a0d67fa6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5fe90672",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Define the path of the data file"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "ecd4a4cf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path = \"ex2-system-a.csv\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "246e7392",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Read the CSV file using `read_csv`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"id": "623096a5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset_a = pd.read_csv(path, sep=\";\", index_col=False, names=[\"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"y_true\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6f764c56",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Display first rows"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"id": "c59a1651",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>0</th>\n",
|
||||
" <th>1</th>\n",
|
||||
" <th>2</th>\n",
|
||||
" <th>3</th>\n",
|
||||
" <th>4</th>\n",
|
||||
" <th>5</th>\n",
|
||||
" <th>6</th>\n",
|
||||
" <th>7</th>\n",
|
||||
" <th>8</th>\n",
|
||||
" <th>9</th>\n",
|
||||
" <th>y_true</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>5.348450e-08</td>\n",
|
||||
" <td>7.493480e-10</td>\n",
|
||||
" <td>8.083470e-07</td>\n",
|
||||
" <td>2.082290e-05</td>\n",
|
||||
" <td>5.222360e-10</td>\n",
|
||||
" <td>2.330260e-08</td>\n",
|
||||
" <td>5.241270e-12</td>\n",
|
||||
" <td>9.999650e-01</td>\n",
|
||||
" <td>4.808590e-07</td>\n",
|
||||
" <td>0.000013</td>\n",
|
||||
" <td>7</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1.334270e-03</td>\n",
|
||||
" <td>3.202960e-05</td>\n",
|
||||
" <td>8.504280e-01</td>\n",
|
||||
" <td>1.669090e-03</td>\n",
|
||||
" <td>1.546460e-07</td>\n",
|
||||
" <td>2.412940e-04</td>\n",
|
||||
" <td>1.448280e-01</td>\n",
|
||||
" <td>1.122810e-11</td>\n",
|
||||
" <td>1.456330e-03</td>\n",
|
||||
" <td>0.000011</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3.643050e-06</td>\n",
|
||||
" <td>9.962760e-01</td>\n",
|
||||
" <td>2.045910e-03</td>\n",
|
||||
" <td>4.210530e-04</td>\n",
|
||||
" <td>2.194020e-05</td>\n",
|
||||
" <td>1.644130e-05</td>\n",
|
||||
" <td>2.838160e-04</td>\n",
|
||||
" <td>3.722960e-04</td>\n",
|
||||
" <td>5.150120e-04</td>\n",
|
||||
" <td>0.000044</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>9.998200e-01</td>\n",
|
||||
" <td>2.550390e-10</td>\n",
|
||||
" <td>1.112010e-05</td>\n",
|
||||
" <td>1.653200e-05</td>\n",
|
||||
" <td>5.375730e-10</td>\n",
|
||||
" <td>8.999750e-05</td>\n",
|
||||
" <td>9.380920e-06</td>\n",
|
||||
" <td>4.464470e-05</td>\n",
|
||||
" <td>2.418440e-06</td>\n",
|
||||
" <td>0.000006</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2.092460e-08</td>\n",
|
||||
" <td>7.464220e-08</td>\n",
|
||||
" <td>3.560820e-05</td>\n",
|
||||
" <td>5.496200e-07</td>\n",
|
||||
" <td>9.988960e-01</td>\n",
|
||||
" <td>3.070920e-08</td>\n",
|
||||
" <td>2.346150e-04</td>\n",
|
||||
" <td>9.748010e-07</td>\n",
|
||||
" <td>1.071610e-06</td>\n",
|
||||
" <td>0.000831</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 0 1 2 3 4 \\\n",
|
||||
"0 5.348450e-08 7.493480e-10 8.083470e-07 2.082290e-05 5.222360e-10 \n",
|
||||
"1 1.334270e-03 3.202960e-05 8.504280e-01 1.669090e-03 1.546460e-07 \n",
|
||||
"2 3.643050e-06 9.962760e-01 2.045910e-03 4.210530e-04 2.194020e-05 \n",
|
||||
"3 9.998200e-01 2.550390e-10 1.112010e-05 1.653200e-05 5.375730e-10 \n",
|
||||
"4 2.092460e-08 7.464220e-08 3.560820e-05 5.496200e-07 9.988960e-01 \n",
|
||||
"\n",
|
||||
" 5 6 7 8 9 y_true \n",
|
||||
"0 2.330260e-08 5.241270e-12 9.999650e-01 4.808590e-07 0.000013 7 \n",
|
||||
"1 2.412940e-04 1.448280e-01 1.122810e-11 1.456330e-03 0.000011 2 \n",
|
||||
"2 1.644130e-05 2.838160e-04 3.722960e-04 5.150120e-04 0.000044 1 \n",
|
||||
"3 8.999750e-05 9.380920e-06 4.464470e-05 2.418440e-06 0.000006 0 \n",
|
||||
"4 3.070920e-08 2.346150e-04 9.748010e-07 1.071610e-06 0.000831 4 "
|
||||
]
|
||||
},
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dataset_a.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "41f040b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Store some useful statistics (class names + number of classes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"id": "fd0adce4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class_names = [\"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\"]\n",
|
||||
"nb_classes = len(class_names)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5a0ab85a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exercise's steps"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "66ae582e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"a) Write a function to take classification decisions on such outputs according to Bayes’rule."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "3c36b377",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def bayes_classification(df):\n",
|
||||
" \"\"\"\n",
|
||||
" Take classification decisions according to Bayes rule.\n",
|
||||
"\n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" df : Pandas DataFrame of shape (n_samples, n_features + ground truth)\n",
|
||||
" Dataset.\n",
|
||||
"\n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" preds : Numpy array of shape (n_samples,)\n",
|
||||
" Class labels for each data sample.\n",
|
||||
" \"\"\"\n",
|
||||
" y_pred = []\n",
|
||||
" for i in range(df.shape[0]):\n",
|
||||
" index = np.argmax(df.iloc[i,:10]) # take all the line except the y value\n",
|
||||
" y_pred.append(index)\n",
|
||||
" \n",
|
||||
" return y_pred\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b5e8140b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"b) What is the overall error rate of the system ?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"id": "f3b21bfb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error rate = 0.10729999999999995\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Your code here: compute and print the error rate of the system\n",
|
||||
"y_pred_a = bayes_classification(dataset_a)\n",
|
||||
"\n",
|
||||
"correct = 0\n",
|
||||
"for i in range(0, len(y_pred_a)):\n",
|
||||
" if(dataset_a.iloc[i,10] == y_pred_a[i]):\n",
|
||||
" correct += 1\n",
|
||||
"\n",
|
||||
"success = correct/len(y_pred_a)\n",
|
||||
"print(f\"Error rate = {1-success}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a4f0fa5f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"c) Compute and report the confusion matrix of the system."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "bb106415",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def confusion_matrix(y_true, y_pred, n_classes):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the confusion matrix.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" y_true : Numpy array of shape (n_samples,)\n",
|
||||
" Ground truth.\n",
|
||||
" y_pred : Numpy array of shape (n_samples,)\n",
|
||||
" Predictions.\n",
|
||||
" n_classes : Integer\n",
|
||||
" Number of classes.\n",
|
||||
" \n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" cm : Numpy array of shape (n_classes, n_classes)\n",
|
||||
" Confusion matrix.\n",
|
||||
" \"\"\"\n",
|
||||
" matrix = np.zeros((n_classes, n_classes))\n",
|
||||
"\n",
|
||||
" for i in range(0, len(y_pred)):\n",
|
||||
" matrix[y_true[i], y_pred[i]] += 1 \n",
|
||||
"\n",
|
||||
" return matrix"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"id": "1b38e3a8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 0 1 2 3 4 5 6 7 8 9\n",
|
||||
" 0 | 944 0 11 0 0 2 10 7 5 1\n",
|
||||
" 1 | 0 1112 2 3 1 4 3 1 9 0\n",
|
||||
" 2 | 10 6 921 12 15 3 19 15 26 5\n",
|
||||
"t 3 | 1 1 31 862 2 72 5 14 12 10\n",
|
||||
"r 4 | 2 3 6 2 910 1 12 6 4 36\n",
|
||||
"u 5 | 12 3 6 29 19 768 19 9 21 6\n",
|
||||
"e 6 | 14 3 21 2 22 28 865 0 3 0\n",
|
||||
" 7 | 0 14 30 9 7 2 1 929 3 33\n",
|
||||
" 8 | 12 16 18 26 24 46 22 19 772 19\n",
|
||||
" 9 | 10 4 6 22 53 18 0 48 4 844\n",
|
||||
" predicted \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Your code here: compute and print the confusion matrix\n",
|
||||
"\n",
|
||||
"cm_a = confusion_matrix(dataset_a.iloc[:,10], y_pred_a, nb_classes)\n",
|
||||
"\n",
|
||||
"#headers\n",
|
||||
"print(\" \", end=\"\")\n",
|
||||
"for j in range(nb_classes):\n",
|
||||
" print(f\"{j:5d}\", end=\"\")\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"#rows\n",
|
||||
"for i in range(nb_classes):\n",
|
||||
" match i:\n",
|
||||
" case 3:\n",
|
||||
" print(\"t\", end=\"\")\n",
|
||||
" case 4:\n",
|
||||
" print(\"r\", end=\"\")\n",
|
||||
" case 5:\n",
|
||||
" print(\"u\", end=\"\")\n",
|
||||
" case 6:\n",
|
||||
" print(\"e\", end=\"\")\n",
|
||||
" case _:\n",
|
||||
" print(\" \", end=\"\")\n",
|
||||
"\n",
|
||||
" print(f\"{i:3d} |\", end=\"\")\n",
|
||||
" for j in range(nb_classes):\n",
|
||||
" print(f\"{int(cm_a[i, j]):5d}\", end=\"\")\n",
|
||||
"\n",
|
||||
" print()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(\" predicted \")\n",
|
||||
"# print(cm.astype(int))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0cf5380f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed8db908",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"d) What are the worst and best classes in terms of precision and recall ?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"id": "0e229ce0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def precision_per_class(cm):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the precision per class.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" cm : Numpy array of shape (n_classes, n_classes)\n",
|
||||
" Confusion matrix.\n",
|
||||
" \n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" precisions : Numpy array of shape (n_classes,)\n",
|
||||
" Precision per class.\n",
|
||||
" \"\"\"\n",
|
||||
" rates = []\n",
|
||||
" for i in range(cm.shape[1]):\n",
|
||||
" correct = cm[i,i]\n",
|
||||
" incorrect = 0\n",
|
||||
" for j in range(cm.shape[0]):\n",
|
||||
" if i != j:\n",
|
||||
" incorrect += cm[j,i]\n",
|
||||
"\n",
|
||||
" rates.append(correct/(correct+incorrect))\n",
|
||||
"\n",
|
||||
" return rates\n",
|
||||
" \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"id": "95325772",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def recall_per_class(cm):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the recall per class.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" cm : Numpy array of shape (n_classes, n_classes)\n",
|
||||
" Confusion matrix.\n",
|
||||
" \n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" recalls : Numpy array of shape (n_classes,)\n",
|
||||
" Recall per class.\n",
|
||||
" \"\"\"\n",
|
||||
" rates = []\n",
|
||||
" for i in range(cm.shape[0]):\n",
|
||||
" correct = cm[i,i]\n",
|
||||
" incorrect = 0\n",
|
||||
" for j in range(cm.shape[1]):\n",
|
||||
" if i != j:\n",
|
||||
" incorrect += cm[i,j]\n",
|
||||
"\n",
|
||||
" rates.append(correct/(correct+incorrect))\n",
|
||||
"\n",
|
||||
" return rates"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "a0fb19e3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Class 0, precision = 0.9393034825870646\n",
|
||||
"Class 1, precision = 0.9569707401032702\n",
|
||||
"Class 2, precision = 0.8754752851711026\n",
|
||||
"Class 3, precision = 0.8914167528438469\n",
|
||||
"Class 4, precision = 0.8641975308641975\n",
|
||||
"Class 5, precision = 0.8135593220338984\n",
|
||||
"Class 6, precision = 0.9048117154811716\n",
|
||||
"Class 7, precision = 0.8864503816793893\n",
|
||||
"Class 8, precision = 0.8987194412107101\n",
|
||||
"Class 9, precision = 0.8846960167714885\n",
|
||||
"\n",
|
||||
"Best = class 1, 0.9569707401032702\n",
|
||||
"Worst = class 5, 0.8135593220338984\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Your code here: find and print the worst and best classes in terms of precision\n",
|
||||
"precision_a = precision_per_class(cm_a)\n",
|
||||
"\n",
|
||||
"for i in range(len(precision_a)):\n",
|
||||
" print(f\"Class {i}, precision = {precision_a[i]}\")\n",
|
||||
"\n",
|
||||
"print(\"\")\n",
|
||||
"\n",
|
||||
"print(f\"Best = class {np.argmax(precision_a)}, {precision_a[np.argmax(precision_a)]}\")\n",
|
||||
"print(f\"Worst = class {np.argmin(precision_a)}, {precision_a[np.argmin(precision_a)]}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "42c3edd8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Class 0, recall = 0.963265306122449\n",
|
||||
"Class 1, recall = 0.9797356828193833\n",
|
||||
"Class 2, recall = 0.8924418604651163\n",
|
||||
"Class 3, recall = 0.8534653465346534\n",
|
||||
"Class 4, recall = 0.9266802443991853\n",
|
||||
"Class 5, recall = 0.8609865470852018\n",
|
||||
"Class 6, recall = 0.9029227557411273\n",
|
||||
"Class 7, recall = 0.9036964980544747\n",
|
||||
"Class 8, recall = 0.7926078028747433\n",
|
||||
"Class 9, recall = 0.8364717542120912\n",
|
||||
"\n",
|
||||
"Best = class 1, 0.9797356828193833\n",
|
||||
"Worst = class 8, 0.7926078028747433\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Your code here: find and print the worst and best classes in terms of recall\n",
|
||||
"\n",
|
||||
"recall_a = recall_per_class(cm_a)\n",
|
||||
"\n",
|
||||
"for i in range(len(recall_a)):\n",
|
||||
" print(f\"Class {i}, recall = {recall_a[i]}\")\n",
|
||||
"\n",
|
||||
"print(\"\")\n",
|
||||
"\n",
|
||||
"print(f\"Best = class {np.argmax(recall_a)}, {recall_a[np.argmax(recall_a)]}\")\n",
|
||||
"print(f\"Worst = class {np.argmin(recall_a)}, {recall_a[np.argmin(recall_a)]}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7ac6fe5d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"e) In file `ex1-system-b.csv` you find the output of a second system B. What is the best system between (a) and (b) in terms of error rate and F1."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "b98c2545",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Your code here: load the data of the system B\n",
|
||||
"path = \"ex2-system-b.csv\"\n",
|
||||
"dataset_b = pd.read_csv(path, sep=\";\", index_col=False, names=[\"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"y_true\"])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"id": "050091b9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def system_accuracy(cm):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the system accuracy.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" cm : Numpy array of shape (n_classes, n_classes)\n",
|
||||
" Confusion matrix.\n",
|
||||
" \n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" accuracy : Float\n",
|
||||
" Accuracy of the system.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" diag = 0\n",
|
||||
" for i in range(cm.shape[0]):\n",
|
||||
" diag += cm[i,i]\n",
|
||||
"\n",
|
||||
" acc = diag / np.sum(cm)\n",
|
||||
" return acc"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"id": "adc0f138",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def system_f1_score(cm):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the system F1 score.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" cm : Numpy array of shape (n_classes, n_classes)\n",
|
||||
" Confusion matrix.\n",
|
||||
" \n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" f1_score : Float\n",
|
||||
" F1 score of the system.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" f1 = []\n",
|
||||
" precision = precision_per_class(cm)\n",
|
||||
" recall = recall_per_class(cm)\n",
|
||||
"\n",
|
||||
" for i in range(0, len(precision)):\n",
|
||||
" f1.append(2*((precision[i] * recall[i])/(precision[i] + recall[i])))\n",
|
||||
" return np.sum(f1)/len(f1)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"id": "f1385c87",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"System A accuracy = 0.8927\n",
|
||||
"System A f1 = 0.8907308492877297\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Your code here: compute and print the accuracy and the F1 score of the system A\n",
|
||||
"\n",
|
||||
"acc_a = system_accuracy(cm_a)\n",
|
||||
"print(f\"System A accuracy = {acc_a}\")\n",
|
||||
"\n",
|
||||
"f1_a = system_f1_score(cm_a)\n",
|
||||
"\n",
|
||||
"print(f\"System A f1 = {f1_a}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"id": "50c64d08",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"System A accuracy = 0.9613\n",
|
||||
"System A f1 = 0.9608568150389065\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Your code here: compute and print the accuracy and the F1 score of the system B\n",
|
||||
"y_pred_b = bayes_classification(dataset_b)\n",
|
||||
"cm_b = confusion_matrix(dataset_b.iloc[:,10], y_pred_b, nb_classes)\n",
|
||||
"\n",
|
||||
"acc_b = system_accuracy(cm_b)\n",
|
||||
"print(f\"System A accuracy = {acc_b}\")\n",
|
||||
"\n",
|
||||
"f1_b = system_f1_score(cm_b)\n",
|
||||
"\n",
|
||||
"print(f\"System A f1 = {f1_b}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
10000
PW-3/ex2/ex2-system-a.csv
Normal file
10000
PW-3/ex2/ex2-system-a.csv
Normal file
File diff suppressed because it is too large
Load Diff
10000
PW-3/ex2/ex2-system-b.csv
Normal file
10000
PW-3/ex2/ex2-system-b.csv
Normal file
File diff suppressed because it is too large
Load Diff
317
PW-3/ex3/ex3-review-questions-stud.ipynb
Normal file
317
PW-3/ex3/ex3-review-questions-stud.ipynb
Normal file
@@ -0,0 +1,317 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "74682f1a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Done by Aviolat Charline, Bach Joachim and Marino Gabriel"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ad0d40d6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Exercice 3 - Review questions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3e556a9d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**a) Assuming an univariate input *x*, what is the complexity at inference time of a Bayesian classifier based on histogram computation of the likelihood ?**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8d2fb7ef",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For each class, we must compute the likelihood, which is one computation per class, so O(nb_class). Then, for each x we must compute the posterior probability, which is looking into a pre-computed histogram (done in the training phase), so this is O(nb_class). The a priori probability only needs to be computed for each class, so O(nb_class). So, the total complexity of the Bayesian classifier is O(2 * nb_class * nb_x)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "99632770",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**b) Bayesian models are said to be generative as they can be used to generate new samples. Taking the implementation of the exercise 1.a, explain the steps to generate new samples using the system you have put into place.**\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "88ab64b2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To generate a new sample, we need to create a y and an x. This can be done by first picking the class Ck randomly according to the a priori probabilities P(Ck). This means that, if there are two classes and P(C1) = 0.6 and P(C2) = 0.4, we pick C1 60% of the time and C2 40% of the time\n",
|
||||
"\n",
|
||||
"Then, we can pick a random x based on the probability density function p(x|Ck). This means we choose a class and, in the density function (like a histogram), we take a random x based on the probabilities. If there are two possible x values and one is distributed as 0.4 and the other 0.6, we will take x1 40% of the time and x2 60%"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e2f611fe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"***Optional*: Provide an implementation in a function generateSample(priors, histValues, edgeValues, n)**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "14aba0f7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def generateSample(priors, histValues, edgeValues, n):\n",
|
||||
" # pick a class according to the proba\n",
|
||||
" # to do that, compute the different probabilities sum. This is done by creating intervals between 0 and 1. The size of those intervals represents the probability of the random\n",
|
||||
" # number generator to land on it.\n",
|
||||
" cumulative_probs = np.cumsum(priors)\n",
|
||||
"\n",
|
||||
" # take a random number and see in which interval it falls. The index of this interval will be the class we chose\n",
|
||||
" chosen_class = 0\n",
|
||||
" r = random.random() \n",
|
||||
" for i, cp in enumerate(cumulative_probs):\n",
|
||||
" if r < cp:\n",
|
||||
" chosen_class = i\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" # The same logic is used to find the new x value. We take the proba of x given c and chose randomly weighted by those proba.\n",
|
||||
" # we have to compute the \"probabilities\" differently, because the histogram is only the count of each x in the c.\n",
|
||||
" # here, we kept the count instead of proba and when generating the random number, instead of chosing between 0 1 and 1 we chose between 0 and total_hist\n",
|
||||
" # which does the same job in the end\n",
|
||||
" total_hist = np.sum(histValues[chosen_class])\n",
|
||||
"\n",
|
||||
" cumulative_probs_hist = np.cumsum(histValues[chosen_class])\n",
|
||||
"\n",
|
||||
" # take a random number and see in which interval it falls. The index of this interval will be the class we chose\n",
|
||||
" chosen_x_index = 0\n",
|
||||
" r = random.uniform(0, total_hist) \n",
|
||||
" for i, cp in enumerate(cumulative_probs_hist):\n",
|
||||
" if r < cp:\n",
|
||||
" chosen_x_index = i\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" chosen_x = edgeValues[chosen_x_index]\n",
|
||||
" \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed8c4f6b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**c) What is the minimum overall accuracy of a 2-class system relying only on priors and that is built on a training set that includes 5 times more samples in class A than in class B?**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4bb03365",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If we only take the priors, then the posterior probability only depends on them. The system will choose the highest posterior probability, so the highest prior because it is all it has. This means it will always choose the class A. If the distribution of the test set is the same as the training set, then always choosing A will give a 5/6 success rate, which will be all the correct A and all the missed B. If the test set is balanced, the success rate will be 50% because it will find all the A and miss all the B. Finally, if the test set is unbalanced in the other way, the success rate will only be the portion of the A class in comparison to the B class. The absolute minimum is then how low the portion of A can be compared to B in the test set."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "58450ff6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**d) Let’s look back at the PW02 exercise 3 of last week. We have built a knn classification systems for images of digits on the MNIST database.**\n",
|
||||
"\n",
|
||||
"**How would you build a Bayesian classification for the same task ? Comment on the prior probabilities and on the likelihood estimators. More specifically, what kind of likelihood estimator could we use in this case ?**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d2bf1500",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The a priori probability is simply the repartition of each class in the dataset.\n",
|
||||
"The likelihood is the tricky part, because the system would need to be multivariate (because of all the pixels), which makes it very complex. We could use the Naive Bayes formula, which states that the features (pixels) are completely uncorrelated, and then we could perform the operation pixel by pixel for each image. However, the pixels ARE NOT uncorrelated, because a pixel is spatially positioned. If a pixel is white, there is a strong chance that there are white pixels somewhere around it as well. The Naive Bayes would technically still work-ish, but with a false presumption.\n",
|
||||
"\n",
|
||||
"To do it correctly, we would have to use something like the multivariate gaussian distribution"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a3ca9715",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"***Optional:* implement it and report performance !**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "4de72736",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Training data shape: (10000, 28, 28)\n",
|
||||
"Training labels shape: (10000,)\n",
|
||||
"Test data shape: (10000, 28, 28)\n",
|
||||
"Test labels shape: (10000,)\n",
|
||||
"(10000, 784) (10000, 784)\n",
|
||||
"Accuracy score : 0.5711\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# This is a method to read the MNIST dataset from a ROOT directory\n",
|
||||
"def load_MNIST(ROOT):\n",
|
||||
" '''load all of mnist\n",
|
||||
" training set first'''\n",
|
||||
" Xtr = []\n",
|
||||
" train = pd.read_csv(os.path.join(ROOT, 'mnist_train.csv'))\n",
|
||||
" X = np.array(train.drop('label', axis=1))\n",
|
||||
" Ytr = np.array(train['label'])\n",
|
||||
" # With this for-loop we give the data a shape of the acctual image (28x28)\n",
|
||||
" # instead of the shape in file (1x784)\n",
|
||||
" for row in X:\n",
|
||||
" Xtr.append(row.reshape(28,28))\n",
|
||||
" # load test set second\n",
|
||||
" Xte = []\n",
|
||||
" test = pd.read_csv(os.path.join(ROOT, 'mnist_test.csv'))\n",
|
||||
" X = np.array(test.drop('label', axis=1))\n",
|
||||
" Yte = np.array(test['label'])\n",
|
||||
" # same reshaping\n",
|
||||
" for row in X:\n",
|
||||
" Xte.append(row.reshape(28,28))\n",
|
||||
" \n",
|
||||
" return np.array(Xtr), np.array(Ytr), np.array(Xte), np.array(Yte)\n",
|
||||
"\n",
|
||||
"# Load the raw MNIST data.\n",
|
||||
"mnist_dir = '' \n",
|
||||
"X_train, y_train, X_test, y_test = load_MNIST(mnist_dir)\n",
|
||||
"\n",
|
||||
"# As a sanity check, we print out the size of the training and test data.\n",
|
||||
"print('Training data shape: ', X_train.shape)\n",
|
||||
"print('Training labels shape: ', y_train.shape)\n",
|
||||
"print('Test data shape: ', X_test.shape)\n",
|
||||
"print('Test labels shape: ', y_test.shape)\n",
|
||||
"X_train = np.reshape(X_train, (X_train.shape[0], -1)) \n",
|
||||
"X_test = np.reshape(X_test, (X_test.shape[0], -1)) \n",
|
||||
"\n",
|
||||
"print(X_train.shape, X_test.shape)\n",
|
||||
"def predict_gaussian(X_test, mu, sigma2, priors):\n",
|
||||
" n_samples = X_test.shape[0]\n",
|
||||
" y_pred = np.zeros(n_samples)\n",
|
||||
"\n",
|
||||
" K, n_pixels = mu.shape\n",
|
||||
" \n",
|
||||
" for idx in range(n_samples):\n",
|
||||
" x = X_test[idx]\n",
|
||||
" proba_classes = np.zeros(K)\n",
|
||||
"\n",
|
||||
" for c in range(K):\n",
|
||||
" log_likelihood = -0.5 * np.log(2 * np.pi * sigma2[c]) - ((x - mu[c])**2) / (2 * sigma2[c])\n",
|
||||
" proba_classes[c] = np.log(priors[c]) + np.sum(log_likelihood)\n",
|
||||
" \n",
|
||||
" y_pred[idx] = np.argmax(proba_classes)\n",
|
||||
"\n",
|
||||
" return y_pred\n",
|
||||
"classes = np.unique(y_train)\n",
|
||||
"priors = np.array([np.mean(y_train == c) for c in classes])\n",
|
||||
"\n",
|
||||
"n_pixels = X_train.shape[1]\n",
|
||||
"\n",
|
||||
"mu = np.zeros((len(classes), n_pixels))\n",
|
||||
"sigma2 = np.zeros((len(classes), n_pixels))\n",
|
||||
"\n",
|
||||
"for c in classes:\n",
|
||||
" X_c = X_train[y_train == c]\n",
|
||||
" mu[c, :] = X_c.mean(axis=0)\n",
|
||||
" sigma2[c, :] = X_c.var(axis=0) + 1e-5\n",
|
||||
" \n",
|
||||
"y_pred = predict_gaussian(X_test, mu, sigma2, priors)\n",
|
||||
"accuracy = np.mean(y_pred == y_test)\n",
|
||||
"\n",
|
||||
"print(\"Accuracy score :\", accuracy)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "07cb7aee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The .57 accuracy observed here might prove that the method is not the right one for this type of problem, because if each pixel is a feature, the number of dimensions becomes way too big. This might also be caused by the fact that pixels are correlated."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b812b46f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**e) Read [europe-border-control-ai-lie-detector](https://theintercept.com/2019/07/26/europe-border-control-ai-lie-detector/). The described system is \"a virtual policeman designed to strengthen European borders\". It can be seen as a 2-class problem, either you are a suspicious traveler or you are not. If you are declared as suspicious by the system, you are routed to a human border agent who analyses your case in a more careful way.**\n",
|
||||
"\n",
|
||||
"1. What kind of errors can the system make ? Explain them in your own words.\n",
|
||||
"2. Is one error more critical than the other ? Explain why.\n",
|
||||
"3. According to the previous points, which metric would you recommend to tune your MLsystem ?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1adf1760",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. The system can make false positives or false negatives. This means it could say that an innocent man is a threat or that a dangerous person is safe to cross the border.\n",
|
||||
"2. Yes, a false negative is the most critical one. In the case of a false positive, the only consequence is a loss of time because you have to interrogate the \"suspect\", maybe resulting in an angry customer. On the other hand, a false negative means a real threat has entered the country and has not been detected, which could have far worse consequences than an angry customer.\n",
|
||||
"3. In this case, we could use the Area Under the Curve with this system. This would allow us to tune the threshold and bias the decision toward false positives rather than false negatives"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "195a1f73-c0f7-4707-9551-c71bfa379960",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**f) When a deep learning architecture is trained using an unbalanced training set, we usually observe a problem of bias, i.e. the system favors one class over another one. Using the Bayes equation, explain what is the origin of the problem.**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fa5ffd45-0645-4093-9a1b-0a7aeaeece0e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The bayes equation : P(Ck|x) = (p(x|Ck)*P(Ck))/p(x).\n",
|
||||
"\n",
|
||||
"The a priori probability (P(Ck)) is what represents the imbalance in the training set. This value is the probability of this class, so it is linked to the number of samples in it. This means that a class with 3x more data in it will have a much bigger P(Ck), which will bias the decision in favor of the biggest P(Ck)."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
202
PW-4/lausanne-appart.csv
Normal file
202
PW-4/lausanne-appart.csv
Normal file
@@ -0,0 +1,202 @@
|
||||
living_area,nb_rooms,rent_price
|
||||
69,3,1810
|
||||
95,3.5,2945
|
||||
21,1.5,685
|
||||
20,1,720
|
||||
33,1.5,830
|
||||
13,1,850
|
||||
17,1,850
|
||||
27,1,855
|
||||
32,1,875
|
||||
26,1.5,890
|
||||
25,1,890
|
||||
31,1,900
|
||||
18,1,900
|
||||
24,1,900
|
||||
25,1,920
|
||||
25,1,930
|
||||
27,1,950
|
||||
37,2,955
|
||||
28,1,960
|
||||
39,1,970
|
||||
31,1,980
|
||||
25,1,980
|
||||
25,1,980
|
||||
29,1.5,1000
|
||||
38,1,1015
|
||||
16,1,1040
|
||||
20,1,1060
|
||||
50,2,1100
|
||||
37,1.5,1130
|
||||
33,2,1160
|
||||
40,1,1200
|
||||
46,1,1210
|
||||
45,2,1235
|
||||
19,1,1250
|
||||
45,2,1310
|
||||
56,2,1315
|
||||
60,2.5,1320
|
||||
23,1,1350
|
||||
49,2,1370
|
||||
51,1.5,1385
|
||||
48,2,1390
|
||||
51,1,1400
|
||||
41,2,1400
|
||||
47,2,1410
|
||||
45,2,1410
|
||||
47,2,1415
|
||||
37,1.5,1420
|
||||
52,2,1445
|
||||
45,2,1450
|
||||
43,2,1450
|
||||
26,1.5,1470
|
||||
49,2,1480
|
||||
49,2.5,1490
|
||||
39,1,1530
|
||||
60,2,1530
|
||||
65,2.5,1580
|
||||
60,3,1590
|
||||
60,3,1590
|
||||
47,1,1595
|
||||
57,2,1630
|
||||
65,2,1640
|
||||
33,1,1650
|
||||
56,2.5,1660
|
||||
69,3,1690
|
||||
61,2.5,1690
|
||||
60,3,1690
|
||||
47,1.5,1700
|
||||
49,2,1700
|
||||
60,2.5,1700
|
||||
72,3,1715
|
||||
70,2.5,1730
|
||||
59,2,1750
|
||||
30,1,1750
|
||||
39,2,1750
|
||||
25,1,1780
|
||||
68,3,1790
|
||||
63,3,1790
|
||||
78,2,1790
|
||||
50,2.5,1800
|
||||
70,3,1810
|
||||
75,2.5,1830
|
||||
60,3,1840
|
||||
70,3.5,1840
|
||||
28,1,1850
|
||||
62,2.5,1860
|
||||
90,3,1870
|
||||
78,3.5,1920
|
||||
80,2,1930
|
||||
72,3,1940
|
||||
78,3.5,1950
|
||||
62,2.5,1980
|
||||
80,3.5,1990
|
||||
80,3.5,2000
|
||||
75,4,2030
|
||||
68,3,2040
|
||||
76,3.5,2060
|
||||
81,3,2080
|
||||
92,3,2085
|
||||
75,3,2090
|
||||
82,3.5,2130
|
||||
80,3.5,2130
|
||||
95,4,2145
|
||||
85,4.5,2160
|
||||
58,2.5,2160
|
||||
33,1.5,2170
|
||||
94,4.5,2190
|
||||
100,3.5,2250
|
||||
77,3.5,2270
|
||||
80,3.5,2270
|
||||
80,3.5,2290
|
||||
92,3.5,2320
|
||||
92,3.5,2335
|
||||
99,4.5,2335
|
||||
98,3,2358
|
||||
90,4.5,2360
|
||||
96,3,2380
|
||||
92,4.5,2380
|
||||
86,4.5,2390
|
||||
73,3,2400
|
||||
80,3.5,2400
|
||||
96,3,2403
|
||||
72,3.5,2410
|
||||
91,4,2420
|
||||
53,2.5,2425
|
||||
60,2.5,2490
|
||||
95,3.5,2500
|
||||
97,4.5,2530
|
||||
103,4.5,2550
|
||||
105,5,2550
|
||||
112,3.5,2550
|
||||
110,4,2560
|
||||
107,5,2570
|
||||
65,1.5,2570
|
||||
97,4.5,2590
|
||||
110,4.5,2625
|
||||
102,4.5,2635
|
||||
101,4.5,2675
|
||||
98,3.5,2700
|
||||
109,4.5,2710
|
||||
120,5,2710
|
||||
107,4,2720
|
||||
125,3.5,2725
|
||||
120,4.5,2750
|
||||
108,5,2805
|
||||
130,5.5,2820
|
||||
112,4.5,2825
|
||||
121,4.5,2830
|
||||
118,4.5,2840
|
||||
107,4.5,2840
|
||||
87,3.5,2850
|
||||
114,4,2850
|
||||
110,4.5,2870
|
||||
118,4.5,2875
|
||||
126,5.5,2900
|
||||
112,4.5,2915
|
||||
93,3,2945
|
||||
80,3.5,2950
|
||||
116,4.5,3050
|
||||
145,4,3050
|
||||
95,4.5,3080
|
||||
100,3.5,3090
|
||||
98,3.5,3090
|
||||
94,4.5,3100
|
||||
110,4.5,3150
|
||||
100,4.5,3160
|
||||
110,2,3180
|
||||
109,4.5,3220
|
||||
131,5,3300
|
||||
133,5,3300
|
||||
86,2.5,3350
|
||||
84,3,3400
|
||||
145,4,3450
|
||||
113,4.5,3490
|
||||
130,3.5,3500
|
||||
108,3,3525
|
||||
94,4.5,3570
|
||||
136,4.5,3765
|
||||
140,4.5,3765
|
||||
125,3.5,3790
|
||||
156,5.5,3930
|
||||
91,3.5,3950
|
||||
130,4.5,3965
|
||||
102,4.5,4061
|
||||
130,4.5,4200
|
||||
142,5.5,4260
|
||||
88,3.5,4310
|
||||
178,6.5,4760
|
||||
150,4.5,4800
|
||||
185,5.5,4900
|
||||
164,6.5,5160
|
||||
214,5.5,5200
|
||||
191,6,5229
|
||||
156,5.5,5250
|
||||
145,5,5383
|
||||
175,5,5460
|
||||
150,4.5,5500
|
||||
129,5.5,5560
|
||||
160,4,5775
|
||||
201,5.5,6200
|
||||
240,6.5,6700
|
||||
145,5,7383
|
||||
|
1245
PW-4/pw4-linear-regression-stud.ipynb
Normal file
1245
PW-4/pw4-linear-regression-stud.ipynb
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user