Compare commits

...

11 Commits

Author SHA1 Message Date
Joachim Bach
14cf888160 done pw4 but not sure about ex7 2025-10-11 11:58:40 +02:00
Joachim Bach
d23d2a51c4 done ex6 2025-10-11 11:14:09 +02:00
Joachim Bach
8ec28596c0 done ex4 2025-10-11 10:03:45 +02:00
Joachim Bach
c360807ab5 added complementary option to pw3 2025-10-09 14:29:32 +02:00
Joachim Bach
770d6678ce changed question about minimum success rate 2025-10-05 19:45:23 +02:00
Joachim Bach
ec48bb1546 done ex3 2025-10-05 11:17:49 +02:00
Joachim Bach
aeb97473ea done ex2 2025-10-04 11:20:37 +02:00
Joachim Bach
a61e837b26 done ex1 2025-10-02 15:59:25 +02:00
gabriel.marinoja
a71160f863 Merge branch 'master' of gitlab.com:ml-mse1-hesso/PracticalWorks 2025-10-01 17:59:38 +02:00
gabriel.marinoja
b72020d0f7 feat: added PW3 2025-10-01 17:58:36 +02:00
Charline Aviolat
a31eea177c Upload New File 2025-09-28 10:01:28 +00:00
10 changed files with 23587 additions and 0 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

100
PW-3/ex1/ex1-data-test.csv Normal file
View File

@@ -0,0 +1,100 @@
39.1963341568658,78.53029405902203,0
40.448499233673424,86.83946993295656,1
65.57192032694599,44.303496565835594,0
79.64811329486565,70.8065641864705,1
66.26022052135889,41.67270317074954,0
97.6637443782087,68.3249232452966,1
30.548823788843436,57.31847952965393,0
89.47322095778219,85.94680780258534,1
50.93087801180052,34.2357678392285,0
39.79292275937423,83.42467462939659,1
47.45440952767612,43.40242137611206,0
69.97497171303611,84.4084067760751,1
66.57906119077748,42.13570922437346,0
85.05872976046471,54.31025004023918,1
66.50445545099684,46.515380367647104,0
75.67274744410004,93.79012528285647,1
30.589637766842877,71.58841488039977,0
43.2174833244174,83.55961536494472,1
58.04023606927604,39.47235992846592,0
40.15801957067056,94.28873609786281,1
65.40785754453304,39.872039582416946,0
58.25386824923051,64.96454852577446,1
90.05150698066501,34.03096751205591,0
72.24873848000416,90.1077757094509,1
32.732305095404456,98.49269418173134,0
74.06410532697512,66.96252809184301,1
30.074888412046263,56.513104954256875,0
87.57197590933474,68.15013081653733,1
54.562040422189284,49.542441977062865,0
78.30902280632358,72.23271250670665,1
57.870305028845,48.514216465966285,0
91.35751201085463,85.6201641726489,1
32.89942225933118,68.89835152862396,0
75.96271751468554,73.37079167632794,1
49.73784613458287,59.13494209712587,0
73.5544567377702,66.04140381033584,1
34.20510941997501,72.62513617755425,0
54.49230689236608,75.50968920375037,1
48.50711697988822,47.74600670205531,0
92.3876668476141,76.82950398511272,1
39.89720264828788,62.09872615693186,0
75.76883065897587,43.6375457580161,1
32.938859931422954,75.6959591164835,0
44.53335294213268,86.44202248365731,1
51.265631719309845,60.12130845234037,0
70.78776945843022,84.2462083261098,1
28.94644639193278,39.599160546805116,0
47.53708530844937,73.62887169594207,1
49.02408652102979,48.50397486087145,0
78.37067490088779,93.91476948225585,1
48.806979396137145,62.206605350437144,0
72.03919354554785,88.5636216577281,1
31.23633606784064,96.30534895479137,0
51.56156298671939,89.15548481990747,1
65.08996501958059,39.488228986986606,0
81.75983894249494,47.952028645978714,1
46.466982795222684,43.17493123886225,0
64.49601863360589,82.20819682836424,1
65.59947425235588,42.79658543523777,0
50.66778894002708,64.22662181783375,1
30.665280235026138,42.70685221873931,0
76.60228200416394,65.62163965042933,1
60.39824874786827,38.54265995207925,0
80.7498890348191,47.942468664004934,1
81.83730756343084,39.62946723071423,0
76.67188156208798,73.0039571691345,1
31.702591304883626,73.4485451232566,0
89.75853252236888,65.1794033434368,1
31.111272744640324,77.90680809560692,0
56.360076920020845,68.81541270666031,1
47.365528695867354,59.268265092300844,0
81.99701278469126,55.477765254828924,1
73.19627144242138,28.399910031060564,0
50.28593379220375,85.68597173591368,1
30.532888808836397,77.17395841411421,0
66.62736064332904,65.14099834530835,1
30.563843972698294,44.15958836055778,0
69.30483520344725,90.15732087213348,1
40.63104177166124,61.47155968946135,0
67.51887729702649,76.70896125160789,1
33.6944962783859,43.961979616998335,0
54.61941030575024,73.60040410454849,1
29.956247697479498,91.60028497230863,0
59.56176709683286,81.89054923262506,1
29.097516205452173,92.0159604576793,0
87.75444054660184,65.2841177353011,1
79.14696413604753,40.118482227299694,0
74.48492746059782,92.34246943037195,1
26.332352061636747,44.9551699040027,0
54.346942016509146,58.43293962287077,1
29.947060203169244,93.06082834209418,0
96.32633710641187,64.80350360838675,1
29.864465690194475,73.11550264372423,0
62.2263271267271,57.84956855286749,1
35.2611254453108,72.85531587549292,0
47.340681257438895,69.41232032562911,1
63.19534209968015,36.963350930620166,0
59.46464897992196,72.40245846384263,1
60.08389682243888,42.48638233127113,0
57.45295498601704,73.67928309399463,1
1 39.1963341568658 78.53029405902203 0
2 40.448499233673424 86.83946993295656 1
3 65.57192032694599 44.303496565835594 0
4 79.64811329486565 70.8065641864705 1
5 66.26022052135889 41.67270317074954 0
6 97.6637443782087 68.3249232452966 1
7 30.548823788843436 57.31847952965393 0
8 89.47322095778219 85.94680780258534 1
9 50.93087801180052 34.2357678392285 0
10 39.79292275937423 83.42467462939659 1
11 47.45440952767612 43.40242137611206 0
12 69.97497171303611 84.4084067760751 1
13 66.57906119077748 42.13570922437346 0
14 85.05872976046471 54.31025004023918 1
15 66.50445545099684 46.515380367647104 0
16 75.67274744410004 93.79012528285647 1
17 30.589637766842877 71.58841488039977 0
18 43.2174833244174 83.55961536494472 1
19 58.04023606927604 39.47235992846592 0
20 40.15801957067056 94.28873609786281 1
21 65.40785754453304 39.872039582416946 0
22 58.25386824923051 64.96454852577446 1
23 90.05150698066501 34.03096751205591 0
24 72.24873848000416 90.1077757094509 1
25 32.732305095404456 98.49269418173134 0
26 74.06410532697512 66.96252809184301 1
27 30.074888412046263 56.513104954256875 0
28 87.57197590933474 68.15013081653733 1
29 54.562040422189284 49.542441977062865 0
30 78.30902280632358 72.23271250670665 1
31 57.870305028845 48.514216465966285 0
32 91.35751201085463 85.6201641726489 1
33 32.89942225933118 68.89835152862396 0
34 75.96271751468554 73.37079167632794 1
35 49.73784613458287 59.13494209712587 0
36 73.5544567377702 66.04140381033584 1
37 34.20510941997501 72.62513617755425 0
38 54.49230689236608 75.50968920375037 1
39 48.50711697988822 47.74600670205531 0
40 92.3876668476141 76.82950398511272 1
41 39.89720264828788 62.09872615693186 0
42 75.76883065897587 43.6375457580161 1
43 32.938859931422954 75.6959591164835 0
44 44.53335294213268 86.44202248365731 1
45 51.265631719309845 60.12130845234037 0
46 70.78776945843022 84.2462083261098 1
47 28.94644639193278 39.599160546805116 0
48 47.53708530844937 73.62887169594207 1
49 49.02408652102979 48.50397486087145 0
50 78.37067490088779 93.91476948225585 1
51 48.806979396137145 62.206605350437144 0
52 72.03919354554785 88.5636216577281 1
53 31.23633606784064 96.30534895479137 0
54 51.56156298671939 89.15548481990747 1
55 65.08996501958059 39.488228986986606 0
56 81.75983894249494 47.952028645978714 1
57 46.466982795222684 43.17493123886225 0
58 64.49601863360589 82.20819682836424 1
59 65.59947425235588 42.79658543523777 0
60 50.66778894002708 64.22662181783375 1
61 30.665280235026138 42.70685221873931 0
62 76.60228200416394 65.62163965042933 1
63 60.39824874786827 38.54265995207925 0
64 80.7498890348191 47.942468664004934 1
65 81.83730756343084 39.62946723071423 0
66 76.67188156208798 73.0039571691345 1
67 31.702591304883626 73.4485451232566 0
68 89.75853252236888 65.1794033434368 1
69 31.111272744640324 77.90680809560692 0
70 56.360076920020845 68.81541270666031 1
71 47.365528695867354 59.268265092300844 0
72 81.99701278469126 55.477765254828924 1
73 73.19627144242138 28.399910031060564 0
74 50.28593379220375 85.68597173591368 1
75 30.532888808836397 77.17395841411421 0
76 66.62736064332904 65.14099834530835 1
77 30.563843972698294 44.15958836055778 0
78 69.30483520344725 90.15732087213348 1
79 40.63104177166124 61.47155968946135 0
80 67.51887729702649 76.70896125160789 1
81 33.6944962783859 43.961979616998335 0
82 54.61941030575024 73.60040410454849 1
83 29.956247697479498 91.60028497230863 0
84 59.56176709683286 81.89054923262506 1
85 29.097516205452173 92.0159604576793 0
86 87.75444054660184 65.2841177353011 1
87 79.14696413604753 40.118482227299694 0
88 74.48492746059782 92.34246943037195 1
89 26.332352061636747 44.9551699040027 0
90 54.346942016509146 58.43293962287077 1
91 29.947060203169244 93.06082834209418 0
92 96.32633710641187 64.80350360838675 1
93 29.864465690194475 73.11550264372423 0
94 62.2263271267271 57.84956855286749 1
95 35.2611254453108 72.85531587549292 0
96 47.340681257438895 69.41232032562911 1
97 63.19534209968015 36.963350930620166 0
98 59.46464897992196 72.40245846384263 1
99 60.08389682243888 42.48638233127113 0
100 57.45295498601704 73.67928309399463 1

100
PW-3/ex1/ex1-data-train.csv Normal file
View File

@@ -0,0 +1,100 @@
34.62365962451697,78.0246928153624,0
30.28671076822607,43.89499752400101,0
35.84740876993872,72.90219802708364,0
60.18259938620976,86.30855209546826,1
79.0327360507101,75.3443764369103,1
45.08327747668339,56.3163717815305,0
61.10666453684766,96.51142588489624,1
75.02474556738889,46.55401354116538,1
76.09878670226257,87.42056971926803,1
84.43281996120035,43.53339331072109,1
95.86155507093572,38.22527805795094,0
75.01365838958247,30.60326323428011,0
82.30705337399482,76.48196330235604,1
69.36458875970939,97.71869196188608,1
39.53833914367223,76.03681085115882,0
53.9710521485623,89.20735013750205,1
69.07014406283025,52.74046973016765,1
67.94685547711617,46.67857410673128,0
70.66150955499435,92.92713789364831,1
76.97878372747498,47.57596364975532,1
67.37202754570876,42.83843832029179,0
89.67677575072079,65.79936592745237,1
50.534788289883,48.85581152764205,0
34.21206097786789,44.20952859866288,0
77.9240914545704,68.9723599933059,1
62.27101367004632,69.95445795447587,1
80.1901807509566,44.82162893218353,1
93.114388797442,38.80067033713209,0
61.83020602312595,50.25610789244621,0
38.78580379679423,64.99568095539578,0
61.379289447425,72.80788731317097,1
85.40451939411645,57.05198397627122,1
52.10797973193984,63.12762376881715,0
52.04540476831827,69.43286012045222,1
40.23689373545111,71.16774802184875,0
54.63510555424817,52.21388588061123,0
33.91550010906887,98.86943574220611,0
64.17698887494485,80.90806058670817,1
74.78925295941542,41.57341522824434,0
34.1836400264419,75.2377203360134,0
83.90239366249155,56.30804621605327,1
51.54772026906181,46.85629026349976,0
94.44336776917852,65.56892160559052,1
82.36875375713919,40.61825515970618,0
51.04775177128865,45.82270145776001,0
62.22267576120188,52.06099194836679,0
77.19303492601364,70.45820000180959,1
97.77159928000232,86.7278223300282,1
62.07306379667647,96.76882412413983,1
91.56497449807442,88.69629254546599,1
79.94481794066932,74.16311935043758,1
99.2725269292572,60.99903099844988,1
90.54671411399852,43.39060180650027,1
34.52451385320009,60.39634245837173,0
50.2864961189907,49.80453881323059,0
49.58667721632031,59.80895099453265,0
97.64563396007767,68.86157272420604,1
32.57720016809309,95.59854761387875,0
74.24869136721598,69.82457122657193,1
71.79646205863379,78.45356224515052,1
75.3956114656803,85.75993667331619,1
35.28611281526193,47.02051394723416,0
56.25381749711624,39.26147251058019,0
30.05882244669796,49.59297386723685,0
44.66826172480893,66.45008614558913,0
66.56089447242954,41.09209807936973,0
40.45755098375164,97.53518548909936,1
49.07256321908844,51.88321182073966,0
80.27957401466998,92.11606081344084,1
66.74671856944039,60.99139402740988,1
32.72283304060323,43.30717306430063,0
64.0393204150601,78.03168802018232,1
72.34649422579923,96.22759296761404,1
60.45788573918959,73.09499809758037,1
58.84095621726802,75.85844831279042,1
99.82785779692128,72.36925193383885,1
47.26426910848174,88.47586499559782,1
50.45815980285988,75.80985952982456,1
60.45555629271532,42.50840943572217,0
82.22666157785568,42.71987853716458,0
88.9138964166533,69.80378889835472,1
94.83450672430196,45.69430680250754,1
67.31925746917527,66.58935317747915,1
57.23870631569862,59.51428198012956,1
80.36675600171273,90.96014789746954,1
68.46852178591112,85.59430710452014,1
42.0754545384731,78.84478600148043,0
75.47770200533905,90.42453899753964,1
78.63542434898018,96.64742716885644,1
52.34800398794107,60.76950525602592,0
94.09433112516793,77.15910509073893,1
90.44855097096364,87.50879176484702,1
55.48216114069585,35.57070347228866,0
74.49269241843041,84.84513684930135,1
89.84580670720979,45.35828361091658,1
83.48916274498238,48.38028579728175,1
42.2617008099817,87.10385094025457,1
99.31500880510394,68.77540947206617,1
55.34001756003703,64.9319380069486,1
74.77589300092767,89.52981289513276,1
1 34.62365962451697 78.0246928153624 0
2 30.28671076822607 43.89499752400101 0
3 35.84740876993872 72.90219802708364 0
4 60.18259938620976 86.30855209546826 1
5 79.0327360507101 75.3443764369103 1
6 45.08327747668339 56.3163717815305 0
7 61.10666453684766 96.51142588489624 1
8 75.02474556738889 46.55401354116538 1
9 76.09878670226257 87.42056971926803 1
10 84.43281996120035 43.53339331072109 1
11 95.86155507093572 38.22527805795094 0
12 75.01365838958247 30.60326323428011 0
13 82.30705337399482 76.48196330235604 1
14 69.36458875970939 97.71869196188608 1
15 39.53833914367223 76.03681085115882 0
16 53.9710521485623 89.20735013750205 1
17 69.07014406283025 52.74046973016765 1
18 67.94685547711617 46.67857410673128 0
19 70.66150955499435 92.92713789364831 1
20 76.97878372747498 47.57596364975532 1
21 67.37202754570876 42.83843832029179 0
22 89.67677575072079 65.79936592745237 1
23 50.534788289883 48.85581152764205 0
24 34.21206097786789 44.20952859866288 0
25 77.9240914545704 68.9723599933059 1
26 62.27101367004632 69.95445795447587 1
27 80.1901807509566 44.82162893218353 1
28 93.114388797442 38.80067033713209 0
29 61.83020602312595 50.25610789244621 0
30 38.78580379679423 64.99568095539578 0
31 61.379289447425 72.80788731317097 1
32 85.40451939411645 57.05198397627122 1
33 52.10797973193984 63.12762376881715 0
34 52.04540476831827 69.43286012045222 1
35 40.23689373545111 71.16774802184875 0
36 54.63510555424817 52.21388588061123 0
37 33.91550010906887 98.86943574220611 0
38 64.17698887494485 80.90806058670817 1
39 74.78925295941542 41.57341522824434 0
40 34.1836400264419 75.2377203360134 0
41 83.90239366249155 56.30804621605327 1
42 51.54772026906181 46.85629026349976 0
43 94.44336776917852 65.56892160559052 1
44 82.36875375713919 40.61825515970618 0
45 51.04775177128865 45.82270145776001 0
46 62.22267576120188 52.06099194836679 0
47 77.19303492601364 70.45820000180959 1
48 97.77159928000232 86.7278223300282 1
49 62.07306379667647 96.76882412413983 1
50 91.56497449807442 88.69629254546599 1
51 79.94481794066932 74.16311935043758 1
52 99.2725269292572 60.99903099844988 1
53 90.54671411399852 43.39060180650027 1
54 34.52451385320009 60.39634245837173 0
55 50.2864961189907 49.80453881323059 0
56 49.58667721632031 59.80895099453265 0
57 97.64563396007767 68.86157272420604 1
58 32.57720016809309 95.59854761387875 0
59 74.24869136721598 69.82457122657193 1
60 71.79646205863379 78.45356224515052 1
61 75.3956114656803 85.75993667331619 1
62 35.28611281526193 47.02051394723416 0
63 56.25381749711624 39.26147251058019 0
64 30.05882244669796 49.59297386723685 0
65 44.66826172480893 66.45008614558913 0
66 66.56089447242954 41.09209807936973 0
67 40.45755098375164 97.53518548909936 1
68 49.07256321908844 51.88321182073966 0
69 80.27957401466998 92.11606081344084 1
70 66.74671856944039 60.99139402740988 1
71 32.72283304060323 43.30717306430063 0
72 64.0393204150601 78.03168802018232 1
73 72.34649422579923 96.22759296761404 1
74 60.45788573918959 73.09499809758037 1
75 58.84095621726802 75.85844831279042 1
76 99.82785779692128 72.36925193383885 1
77 47.26426910848174 88.47586499559782 1
78 50.45815980285988 75.80985952982456 1
79 60.45555629271532 42.50840943572217 0
80 82.22666157785568 42.71987853716458 0
81 88.9138964166533 69.80378889835472 1
82 94.83450672430196 45.69430680250754 1
83 67.31925746917527 66.58935317747915 1
84 57.23870631569862 59.51428198012956 1
85 80.36675600171273 90.96014789746954 1
86 68.46852178591112 85.59430710452014 1
87 42.0754545384731 78.84478600148043 0
88 75.47770200533905 90.42453899753964 1
89 78.63542434898018 96.64742716885644 1
90 52.34800398794107 60.76950525602592 0
91 94.09433112516793 77.15910509073893 1
92 90.44855097096364 87.50879176484702 1
93 55.48216114069585 35.57070347228866 0
94 74.49269241843041 84.84513684930135 1
95 89.84580670720979 45.35828361091658 1
96 83.48916274498238 48.38028579728175 1
97 42.2617008099817 87.10385094025457 1
98 99.31500880510394 68.77540947206617 1
99 55.34001756003703 64.9319380069486 1
100 74.77589300092767 89.52981289513276 1

View File

@@ -0,0 +1,745 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "bcf79585",
"metadata": {},
"source": [
"# Exercise 2 - System evaluation"
]
},
{
"cell_type": "markdown",
"id": "f642cedb",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "9421a4e1",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "a0d67fa6",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "markdown",
"id": "5fe90672",
"metadata": {},
"source": [
"Define the path of the data file"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "ecd4a4cf",
"metadata": {},
"outputs": [],
"source": [
"path = \"ex2-system-a.csv\""
]
},
{
"cell_type": "markdown",
"id": "246e7392",
"metadata": {},
"source": [
"Read the CSV file using `read_csv`"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "623096a5",
"metadata": {},
"outputs": [],
"source": [
"dataset_a = pd.read_csv(path, sep=\";\", index_col=False, names=[\"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"y_true\"])"
]
},
{
"cell_type": "markdown",
"id": "6f764c56",
"metadata": {},
"source": [
"Display first rows"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "c59a1651",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>y_true</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.348450e-08</td>\n",
" <td>7.493480e-10</td>\n",
" <td>8.083470e-07</td>\n",
" <td>2.082290e-05</td>\n",
" <td>5.222360e-10</td>\n",
" <td>2.330260e-08</td>\n",
" <td>5.241270e-12</td>\n",
" <td>9.999650e-01</td>\n",
" <td>4.808590e-07</td>\n",
" <td>0.000013</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.334270e-03</td>\n",
" <td>3.202960e-05</td>\n",
" <td>8.504280e-01</td>\n",
" <td>1.669090e-03</td>\n",
" <td>1.546460e-07</td>\n",
" <td>2.412940e-04</td>\n",
" <td>1.448280e-01</td>\n",
" <td>1.122810e-11</td>\n",
" <td>1.456330e-03</td>\n",
" <td>0.000011</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.643050e-06</td>\n",
" <td>9.962760e-01</td>\n",
" <td>2.045910e-03</td>\n",
" <td>4.210530e-04</td>\n",
" <td>2.194020e-05</td>\n",
" <td>1.644130e-05</td>\n",
" <td>2.838160e-04</td>\n",
" <td>3.722960e-04</td>\n",
" <td>5.150120e-04</td>\n",
" <td>0.000044</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9.998200e-01</td>\n",
" <td>2.550390e-10</td>\n",
" <td>1.112010e-05</td>\n",
" <td>1.653200e-05</td>\n",
" <td>5.375730e-10</td>\n",
" <td>8.999750e-05</td>\n",
" <td>9.380920e-06</td>\n",
" <td>4.464470e-05</td>\n",
" <td>2.418440e-06</td>\n",
" <td>0.000006</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.092460e-08</td>\n",
" <td>7.464220e-08</td>\n",
" <td>3.560820e-05</td>\n",
" <td>5.496200e-07</td>\n",
" <td>9.988960e-01</td>\n",
" <td>3.070920e-08</td>\n",
" <td>2.346150e-04</td>\n",
" <td>9.748010e-07</td>\n",
" <td>1.071610e-06</td>\n",
" <td>0.000831</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 \\\n",
"0 5.348450e-08 7.493480e-10 8.083470e-07 2.082290e-05 5.222360e-10 \n",
"1 1.334270e-03 3.202960e-05 8.504280e-01 1.669090e-03 1.546460e-07 \n",
"2 3.643050e-06 9.962760e-01 2.045910e-03 4.210530e-04 2.194020e-05 \n",
"3 9.998200e-01 2.550390e-10 1.112010e-05 1.653200e-05 5.375730e-10 \n",
"4 2.092460e-08 7.464220e-08 3.560820e-05 5.496200e-07 9.988960e-01 \n",
"\n",
" 5 6 7 8 9 y_true \n",
"0 2.330260e-08 5.241270e-12 9.999650e-01 4.808590e-07 0.000013 7 \n",
"1 2.412940e-04 1.448280e-01 1.122810e-11 1.456330e-03 0.000011 2 \n",
"2 1.644130e-05 2.838160e-04 3.722960e-04 5.150120e-04 0.000044 1 \n",
"3 8.999750e-05 9.380920e-06 4.464470e-05 2.418440e-06 0.000006 0 \n",
"4 3.070920e-08 2.346150e-04 9.748010e-07 1.071610e-06 0.000831 4 "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_a.head()"
]
},
{
"cell_type": "markdown",
"id": "41f040b0",
"metadata": {},
"source": [
"Store some useful statistics (class names + number of classes)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "fd0adce4",
"metadata": {},
"outputs": [],
"source": [
"class_names = [\"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\"]\n",
"nb_classes = len(class_names)"
]
},
{
"cell_type": "markdown",
"id": "5a0ab85a",
"metadata": {},
"source": [
"## Exercise's steps"
]
},
{
"cell_type": "markdown",
"id": "66ae582e",
"metadata": {},
"source": [
"a) Write a function to take classification decisions on such outputs according to Bayes' rule."
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "3c36b377",
"metadata": {},
"outputs": [],
"source": [
"def bayes_classification(df):\n",
" \"\"\"\n",
" Take classification decisions according to Bayes rule.\n",
"\n",
" Parameters\n",
" ----------\n",
" df : Pandas DataFrame of shape (n_samples, n_features + ground truth)\n",
" Dataset.\n",
"\n",
" Returns\n",
" -------\n",
" preds : Numpy array of shape (n_samples,)\n",
" Class labels for each data sample.\n",
" \"\"\"\n",
" y_pred = []\n",
" for i in range(df.shape[0]):\n",
" index = np.argmax(df.iloc[i,:10]) # take all the line except the y value\n",
" y_pred.append(index)\n",
" \n",
" return y_pred\n"
]
},
{
"cell_type": "markdown",
"id": "b5e8140b",
"metadata": {},
"source": [
"b) What is the overall error rate of the system ?"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "f3b21bfb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error rate = 0.10729999999999995\n"
]
}
],
"source": [
"# Your code here: compute and print the error rate of the system\n",
"y_pred_a = bayes_classification(dataset_a)\n",
"\n",
"correct = 0\n",
"for i in range(0, len(y_pred_a)):\n",
" if(dataset_a.iloc[i,10] == y_pred_a[i]):\n",
" correct += 1\n",
"\n",
"success = correct/len(y_pred_a)\n",
"print(f\"Error rate = {1-success}\")"
]
},
{
"cell_type": "markdown",
"id": "a4f0fa5f",
"metadata": {},
"source": [
"c) Compute and report the confusion matrix of the system."
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "bb106415",
"metadata": {},
"outputs": [],
"source": [
"def confusion_matrix(y_true, y_pred, n_classes):\n",
" \"\"\"\n",
" Compute the confusion matrix.\n",
" \n",
" Parameters\n",
" ----------\n",
" y_true : Numpy array of shape (n_samples,)\n",
" Ground truth.\n",
" y_pred : Numpy array of shape (n_samples,)\n",
" Predictions.\n",
" n_classes : Integer\n",
" Number of classes.\n",
" \n",
" Returns\n",
" -------\n",
" cm : Numpy array of shape (n_classes, n_classes)\n",
" Confusion matrix.\n",
" \"\"\"\n",
" matrix = np.zeros((n_classes, n_classes))\n",
"\n",
" for i in range(0, len(y_pred)):\n",
" matrix[y_true[i], y_pred[i]] += 1 \n",
"\n",
" return matrix"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "1b38e3a8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0 1 2 3 4 5 6 7 8 9\n",
" 0 | 944 0 11 0 0 2 10 7 5 1\n",
" 1 | 0 1112 2 3 1 4 3 1 9 0\n",
" 2 | 10 6 921 12 15 3 19 15 26 5\n",
"t 3 | 1 1 31 862 2 72 5 14 12 10\n",
"r 4 | 2 3 6 2 910 1 12 6 4 36\n",
"u 5 | 12 3 6 29 19 768 19 9 21 6\n",
"e 6 | 14 3 21 2 22 28 865 0 3 0\n",
" 7 | 0 14 30 9 7 2 1 929 3 33\n",
" 8 | 12 16 18 26 24 46 22 19 772 19\n",
" 9 | 10 4 6 22 53 18 0 48 4 844\n",
" predicted \n"
]
}
],
"source": [
"# Your code here: compute and print the confusion matrix\n",
"\n",
"cm_a = confusion_matrix(dataset_a.iloc[:,10], y_pred_a, nb_classes)\n",
"\n",
"#headers\n",
"print(\" \", end=\"\")\n",
"for j in range(nb_classes):\n",
" print(f\"{j:5d}\", end=\"\")\n",
"print()\n",
"\n",
"#rows\n",
"for i in range(nb_classes):\n",
" match i:\n",
" case 3:\n",
" print(\"t\", end=\"\")\n",
" case 4:\n",
" print(\"r\", end=\"\")\n",
" case 5:\n",
" print(\"u\", end=\"\")\n",
" case 6:\n",
" print(\"e\", end=\"\")\n",
" case _:\n",
" print(\" \", end=\"\")\n",
"\n",
" print(f\"{i:3d} |\", end=\"\")\n",
" for j in range(nb_classes):\n",
" print(f\"{int(cm_a[i, j]):5d}\", end=\"\")\n",
"\n",
" print()\n",
"\n",
"\n",
"print(\" predicted \")\n",
"# print(cm.astype(int))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0cf5380f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "ed8db908",
"metadata": {},
"source": [
"d) What are the worst and best classes in terms of precision and recall ?"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "0e229ce0",
"metadata": {},
"outputs": [],
"source": [
"def precision_per_class(cm):\n",
" \"\"\"\n",
" Compute the precision per class.\n",
" \n",
" Parameters\n",
" ----------\n",
" cm : Numpy array of shape (n_classes, n_classes)\n",
" Confusion matrix.\n",
" \n",
" Returns\n",
" -------\n",
" precisions : Numpy array of shape (n_classes,)\n",
" Precision per class.\n",
" \"\"\"\n",
" rates = []\n",
" for i in range(cm.shape[1]):\n",
" correct = cm[i,i]\n",
" incorrect = 0\n",
" for j in range(cm.shape[0]):\n",
" if i != j:\n",
" incorrect += cm[j,i]\n",
"\n",
" rates.append(correct/(correct+incorrect))\n",
"\n",
" return rates\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "95325772",
"metadata": {},
"outputs": [],
"source": [
"def recall_per_class(cm):\n",
" \"\"\"\n",
" Compute the recall per class.\n",
" \n",
" Parameters\n",
" ----------\n",
" cm : Numpy array of shape (n_classes, n_classes)\n",
" Confusion matrix.\n",
" \n",
" Returns\n",
" -------\n",
" recalls : Numpy array of shape (n_classes,)\n",
" Recall per class.\n",
" \"\"\"\n",
" rates = []\n",
" for i in range(cm.shape[0]):\n",
" correct = cm[i,i]\n",
" incorrect = 0\n",
" for j in range(cm.shape[1]):\n",
" if i != j:\n",
" incorrect += cm[i,j]\n",
"\n",
" rates.append(correct/(correct+incorrect))\n",
"\n",
" return rates"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "a0fb19e3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class 0, precision = 0.9393034825870646\n",
"Class 1, precision = 0.9569707401032702\n",
"Class 2, precision = 0.8754752851711026\n",
"Class 3, precision = 0.8914167528438469\n",
"Class 4, precision = 0.8641975308641975\n",
"Class 5, precision = 0.8135593220338984\n",
"Class 6, precision = 0.9048117154811716\n",
"Class 7, precision = 0.8864503816793893\n",
"Class 8, precision = 0.8987194412107101\n",
"Class 9, precision = 0.8846960167714885\n",
"\n",
"Best = class 1, 0.9569707401032702\n",
"Worst = class 5, 0.8135593220338984\n"
]
}
],
"source": [
"# Your code here: find and print the worst and best classes in terms of precision\n",
"precision_a = precision_per_class(cm_a)\n",
"\n",
"for i in range(len(precision_a)):\n",
" print(f\"Class {i}, precision = {precision_a[i]}\")\n",
"\n",
"print(\"\")\n",
"\n",
"print(f\"Best = class {np.argmax(precision_a)}, {precision_a[np.argmax(precision_a)]}\")\n",
"print(f\"Worst = class {np.argmin(precision_a)}, {precision_a[np.argmin(precision_a)]}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "42c3edd8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class 0, recall = 0.963265306122449\n",
"Class 1, recall = 0.9797356828193833\n",
"Class 2, recall = 0.8924418604651163\n",
"Class 3, recall = 0.8534653465346534\n",
"Class 4, recall = 0.9266802443991853\n",
"Class 5, recall = 0.8609865470852018\n",
"Class 6, recall = 0.9029227557411273\n",
"Class 7, recall = 0.9036964980544747\n",
"Class 8, recall = 0.7926078028747433\n",
"Class 9, recall = 0.8364717542120912\n",
"\n",
"Best = class 1, 0.9797356828193833\n",
"Worst = class 8, 0.7926078028747433\n"
]
}
],
"source": [
"# Your code here: find and print the worst and best classes in terms of recall\n",
"\n",
"recall_a = recall_per_class(cm_a)\n",
"\n",
"for i in range(len(recall_a)):\n",
" print(f\"Class {i}, recall = {recall_a[i]}\")\n",
"\n",
"print(\"\")\n",
"\n",
"print(f\"Best = class {np.argmax(recall_a)}, {recall_a[np.argmax(recall_a)]}\")\n",
"print(f\"Worst = class {np.argmin(recall_a)}, {recall_a[np.argmin(recall_a)]}\")\n"
]
},
{
"cell_type": "markdown",
"id": "7ac6fe5d",
"metadata": {},
"source": [
"e) In file `ex2-system-b.csv` you find the output of a second system B. What is the best system between (a) and (b) in terms of error rate and F1?"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "b98c2545",
"metadata": {},
"outputs": [],
"source": [
"# Your code here: load the data of the system B\n",
"path = \"ex2-system-b.csv\"\n",
"dataset_b = pd.read_csv(path, sep=\";\", index_col=False, names=[\"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"y_true\"])\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "050091b9",
"metadata": {},
"outputs": [],
"source": [
"def system_accuracy(cm):\n",
" \"\"\"\n",
" Compute the system accuracy.\n",
" \n",
" Parameters\n",
" ----------\n",
" cm : Numpy array of shape (n_classes, n_classes)\n",
" Confusion matrix.\n",
" \n",
" Returns\n",
" -------\n",
" accuracy : Float\n",
" Accuracy of the system.\n",
" \"\"\"\n",
"\n",
" diag = 0\n",
" for i in range(cm.shape[0]):\n",
" diag += cm[i,i]\n",
"\n",
" acc = diag / np.sum(cm)\n",
" return acc"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "adc0f138",
"metadata": {},
"outputs": [],
"source": [
"def system_f1_score(cm):\n",
" \"\"\"\n",
" Compute the system F1 score.\n",
" \n",
" Parameters\n",
" ----------\n",
" cm : Numpy array of shape (n_classes, n_classes)\n",
" Confusion matrix.\n",
" \n",
" Returns\n",
" -------\n",
" f1_score : Float\n",
" F1 score of the system.\n",
" \"\"\"\n",
"\n",
" f1 = []\n",
" precision = precision_per_class(cm)\n",
" recall = recall_per_class(cm)\n",
"\n",
" for i in range(0, len(precision)):\n",
" f1.append(2*((precision[i] * recall[i])/(precision[i] + recall[i])))\n",
" return np.sum(f1)/len(f1)\n"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "f1385c87",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"System A accuracy = 0.8927\n",
"System A f1 = 0.8907308492877297\n"
]
}
],
"source": [
"# Your code here: compute and print the accuracy and the F1 score of the system A\n",
"\n",
"acc_a = system_accuracy(cm_a)\n",
"print(f\"System A accuracy = {acc_a}\")\n",
"\n",
"f1_a = system_f1_score(cm_a)\n",
"\n",
"print(f\"System A f1 = {f1_a}\")"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "50c64d08",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"System B accuracy = 0.9613\n",
"System B f1 = 0.9608568150389065\n"
]
}
],
"source": [
"# Your code here: compute and print the accuracy and the F1 score of the system B\n",
"y_pred_b = bayes_classification(dataset_b)\n",
"cm_b = confusion_matrix(dataset_b.iloc[:,10], y_pred_b, nb_classes)\n",
"\n",
"acc_b = system_accuracy(cm_b)\n",
"print(f\"System B accuracy = {acc_b}\")\n",
"\n",
"f1_b = system_f1_score(cm_b)\n",
"\n",
"print(f\"System B f1 = {f1_b}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

10000
PW-3/ex2/ex2-system-a.csv Normal file

File diff suppressed because it is too large Load Diff

10000
PW-3/ex2/ex2-system-b.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,317 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "74682f1a",
"metadata": {},
"source": [
"# Done by Aviolat Charline, Bach Joachim and Marino Gabriel"
]
},
{
"cell_type": "markdown",
"id": "ad0d40d6",
"metadata": {},
"source": [
"# Exercice 3 - Review questions"
]
},
{
"cell_type": "markdown",
"id": "3e556a9d",
"metadata": {},
"source": [
"**a) Assuming an univariate input *x*, what is the complexity at inference time of a Bayesian classifier based on histogram computation of the likelihood ?**"
]
},
{
"cell_type": "markdown",
"id": "8d2fb7ef",
"metadata": {},
"source": [
"For each class, we must compute the likelyhood, which is one calculus per class, so O(nb_class). Then, for each x we must compute the posteriori probability, which is looking into a pre-computed histogram (done in the training phase), so this is O(nb_class). The a priori probability only needs to be computed for each class, so O(nb_class). So, the total complexity of the Bayesian classifier is O(2 * nb_class * nb_x)"
]
},
{
"cell_type": "markdown",
"id": "99632770",
"metadata": {},
"source": [
"**b) Bayesian models are said to be generative as they can be used to generate new samples. Taking the implementation of the exercise 1.a, explain the steps to generate new samples using the system you have put into place.**\n",
" "
]
},
{
"cell_type": "markdown",
"id": "88ab64b2",
"metadata": {},
"source": [
"To generate a new sample, we need to create a y and a x. This can be done by firstly picking the class Ck randomly according to the a priori probabilities P(Ck). This means that, if there is two classes and P(C1) = 0.6 and P(C2) = 0.4, we take 60% of the time C1 and 40% of the time C2\n",
"\n",
"Then, we can pick a random x based on the probability density function p(x|Ck). This means we choose a class and, in the density function (like histogram), we take a random x based on the probablilities. If there is two x values possibles and one is distributed as 0.4 and the other 0.6, we will taxe x1 40% of the time and x2 60%"
]
},
{
"cell_type": "markdown",
"id": "e2f611fe",
"metadata": {},
"source": [
"***Optional*: Provide an implementation in a function generateSample(priors, histValues, edgeValues, n)**"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "14aba0f7",
"metadata": {},
"outputs": [],
"source": [
"def generateSample(priors, histValues, edgeValues, n):\n",
" # pick a class according to the proba\n",
" # to do that, compute the different probabilities sum. This is done by creating intervals between 0 and 1. The size of those intervals represents the probability of the random\n",
" # number generator to land on it.\n",
" cumulative_probs = np.cumsum(priors)\n",
"\n",
" # take a random number and see in which interval it falls. The index of this interval will be the class we chose\n",
" chosen_class = 0\n",
" r = random.random() \n",
" for i, cp in enumerate(cumulative_probs):\n",
" if r < cp:\n",
" chosen_class = i\n",
" break\n",
" \n",
"\n",
" # The same logic is used to find the new x value. We take the proba of x given c and chose randomly weighted by those proba.\n",
" # we have to compute the \"probabilities\" differently, because the histogram is only the count of each x in the c.\n",
" # here, we kept the count instead of proba and when generating the random number, instead of chosing between 0 1 and 1 we chose between 0 and total_hist\n",
" # which does the same job in the end\n",
" total_hist = np.sum(histValues[chosen_class])\n",
"\n",
" cumulative_probs_hist = np.cumsum(histValues[chosen_class])\n",
"\n",
" # take a random number and see in which interval it falls. The index of this interval will be the class we chose\n",
" chosen_x_index = 0\n",
" r = random.uniform(0, total_hist) \n",
" for i, cp in enumerate(cumulative_probs_hist):\n",
" if r < cp:\n",
" chosen_x_index = i\n",
" break\n",
"\n",
" chosen_x = edgeValues[chosen_x_index]\n",
" \n"
]
},
{
"cell_type": "markdown",
"id": "ed8c4f6b",
"metadata": {},
"source": [
"**c) What is the minimum overall accuracy of a 2-class system relying only on priors and that is built on a training set that includes 5 times more samples in class A than in class B?**"
]
},
{
"cell_type": "markdown",
"id": "4bb03365",
"metadata": {},
"source": [
"If we only take the priors, then the posterior probability only depends on it. The system will chose the highest posterior probability, so the highest prior because it is all it has. This means it will always choose the class A. If the repartition of the test set is the same as the training set, then always choosing A will give a 5/6 success rate, which will be all the correct A and all the missed B. If the test set is balanced, the success rate will be 50% because it will find all the A and miss all the B. Finally, if the system is unbalanced in the other way, the success rate will only be the portion of the A class in comparaison to the B class. The absolute minimum is then how low the portion of A can be compared to B in the test set."
]
},
{
"cell_type": "markdown",
"id": "58450ff6",
"metadata": {},
"source": [
"**d) Lets look back at the PW02 exercise 3 of last week. We have built a knn classification systems for images of digits on the MNIST database.**\n",
"\n",
"**How would you build a Bayesian classification for the same task ? Comment on the prior probabilities and on the likelihood estimators. More specifically, what kind of likelihood estimator could we use in this case ?**"
]
},
{
"cell_type": "markdown",
"id": "d2bf1500",
"metadata": {},
"source": [
"The a priori probability is simply the repartition of each class in the dataset.\n",
"The likelihood is the tricky part, because the system would need to be multivariate (because of all the pixels), which makes it very complex. We could use the Naive Bayes formula witch states that the features (pixels) are completely uncorrelated and then we cound perform the operation pixel per pixel for each image. However, the pixels ARE NOT uncorrelated, because a pixel is spatially positionned. If a pixel is white, there is a strong change that there are white pixels somewhere around as well. The Naive Bayes would technically still work-ish, but with a false presomption.\n",
"\n",
"To do it correctly, we would have to use something like the multivariate gaussian distribution"
]
},
{
"cell_type": "markdown",
"id": "a3ca9715",
"metadata": {},
"source": [
"***Optional:* implement it and report performance !**"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "4de72736",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data shape: (10000, 28, 28)\n",
"Training labels shape: (10000,)\n",
"Test data shape: (10000, 28, 28)\n",
"Test labels shape: (10000,)\n",
"(10000, 784) (10000, 784)\n",
"Accuracy score : 0.5711\n"
]
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import os\n",
"\n",
"\n",
"# This is a method to read the MNIST dataset from a ROOT directory\n",
"def load_MNIST(ROOT):\n",
" '''load all of mnist\n",
" training set first'''\n",
" Xtr = []\n",
" train = pd.read_csv(os.path.join(ROOT, 'mnist_train.csv'))\n",
" X = np.array(train.drop('label', axis=1))\n",
" Ytr = np.array(train['label'])\n",
" # With this for-loop we give the data a shape of the acctual image (28x28)\n",
" # instead of the shape in file (1x784)\n",
" for row in X:\n",
" Xtr.append(row.reshape(28,28))\n",
" # load test set second\n",
" Xte = []\n",
" test = pd.read_csv(os.path.join(ROOT, 'mnist_test.csv'))\n",
" X = np.array(test.drop('label', axis=1))\n",
" Yte = np.array(test['label'])\n",
" # same reshaping\n",
" for row in X:\n",
" Xte.append(row.reshape(28,28))\n",
" \n",
" return np.array(Xtr), np.array(Ytr), np.array(Xte), np.array(Yte)\n",
"\n",
"# Load the raw MNIST data.\n",
"mnist_dir = '' \n",
"X_train, y_train, X_test, y_test = load_MNIST(mnist_dir)\n",
"\n",
"# As a sanity check, we print out the size of the training and test data.\n",
"print('Training data shape: ', X_train.shape)\n",
"print('Training labels shape: ', y_train.shape)\n",
"print('Test data shape: ', X_test.shape)\n",
"print('Test labels shape: ', y_test.shape)\n",
"X_train = np.reshape(X_train, (X_train.shape[0], -1)) \n",
"X_test = np.reshape(X_test, (X_test.shape[0], -1)) \n",
"\n",
"print(X_train.shape, X_test.shape)\n",
"def predict_gaussian(X_test, mu, sigma2, priors):\n",
" n_samples = X_test.shape[0]\n",
" y_pred = np.zeros(n_samples)\n",
"\n",
" K, n_pixels = mu.shape\n",
" \n",
" for idx in range(n_samples):\n",
" x = X_test[idx]\n",
" proba_classes = np.zeros(K)\n",
"\n",
" for c in range(K):\n",
" log_likelihood = -0.5 * np.log(2 * np.pi * sigma2[c]) - ((x - mu[c])**2) / (2 * sigma2[c])\n",
" proba_classes[c] = np.log(priors[c]) + np.sum(log_likelihood)\n",
" \n",
" y_pred[idx] = np.argmax(proba_classes)\n",
"\n",
" return y_pred\n",
"classes = np.unique(y_train)\n",
"priors = np.array([np.mean(y_train == c) for c in classes])\n",
"\n",
"n_pixels = X_train.shape[1]\n",
"\n",
"mu = np.zeros((len(classes), n_pixels))\n",
"sigma2 = np.zeros((len(classes), n_pixels))\n",
"\n",
"for c in classes:\n",
" X_c = X_train[y_train == c]\n",
" mu[c, :] = X_c.mean(axis=0)\n",
" sigma2[c, :] = X_c.var(axis=0) + 1e-5\n",
" \n",
"y_pred = predict_gaussian(X_test, mu, sigma2, priors)\n",
"accuracy = np.mean(y_pred == y_test)\n",
"\n",
"print(\"Accuracy score :\", accuracy)"
]
},
{
"cell_type": "markdown",
"id": "07cb7aee",
"metadata": {},
"source": [
"The .57 accuracy observed here might prove that the method is not the right one for this type of problems, because if each pixel is a feature, the number of dimensions become way to big. This might also be caused by the fact that pixels are not uncorrelated."
]
},
{
"cell_type": "markdown",
"id": "b812b46f",
"metadata": {},
"source": [
"**e) Read [europe-border-control-ai-lie-detector](https://theintercept.com/2019/07/26/europe-border-control-ai-lie-detector/). The described system is \"a virtual policeman designed to strengthen European borders\". It can be seen as a 2-class problem, either you are a suspicious traveler or you are not. If you are declared as suspicious by the system, you are routed to a human border agent who analyses your case in a more careful way.**\n",
"\n",
"1. What kind of errors can the system make ? Explain them in your own words.\n",
"2. Is one error more critical than the other ? Explain why.\n",
"3. According to the previous points, which metric would you recommend to tune your MLsystem ?"
]
},
{
"cell_type": "markdown",
"id": "1adf1760",
"metadata": {},
"source": [
"1. The system can make false positives or false negatives. This means it could say that an innocent man is a threat or that a dangerous person is safe to cross the border.\n",
"2. Yes, a false negative is the most critical one. In the case of a false positive, the only consequence is a lost of time because you have to interrogate the \"suspect\", maybe resulting in a angry customer. On the other hand, a false negative means a real threat has entered the country and has not been detect, wich could have way more concequences than an angry customer.\n",
"3. In this case, we could use the Area Under the Curve with this system. This would allow to tune the the treshold and impact the decision to tend more to false positives rather than false negatives"
]
},
{
"cell_type": "markdown",
"id": "195a1f73-c0f7-4707-9551-c71bfa379960",
"metadata": {},
"source": [
"**f) When a deep learning architecture is trained using an unbalanced training set, we usually observe a problem of bias, i.e. the system favors one class over another one. Using the Bayes equation, explain what is the origin of the problem.**"
]
},
{
"cell_type": "markdown",
"id": "fa5ffd45-0645-4093-9a1b-0a7aeaeece0e",
"metadata": {},
"source": [
"The bayes equation : P(Ck|x) = (p(x|Ck)*P(Ck))/p(x).\n",
"\n",
"The a priori probability (P(Ck)) is what reprensents the unbalance in the training set. This value is the probability to have this class, so it is linked to the number of data in it. This means that a class with 3x more data in it will have a way bigger P(Ck) witch will impact the decision in favor of the biggest P(Ck)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

202
PW-4/lausanne-appart.csv Normal file
View File

@@ -0,0 +1,202 @@
living_area,nb_rooms,rent_price
69,3,1810
95,3.5,2945
21,1.5,685
20,1,720
33,1.5,830
13,1,850
17,1,850
27,1,855
32,1,875
26,1.5,890
25,1,890
31,1,900
18,1,900
24,1,900
25,1,920
25,1,930
27,1,950
37,2,955
28,1,960
39,1,970
31,1,980
25,1,980
25,1,980
29,1.5,1000
38,1,1015
16,1,1040
20,1,1060
50,2,1100
37,1.5,1130
33,2,1160
40,1,1200
46,1,1210
45,2,1235
19,1,1250
45,2,1310
56,2,1315
60,2.5,1320
23,1,1350
49,2,1370
51,1.5,1385
48,2,1390
51,1,1400
41,2,1400
47,2,1410
45,2,1410
47,2,1415
37,1.5,1420
52,2,1445
45,2,1450
43,2,1450
26,1.5,1470
49,2,1480
49,2.5,1490
39,1,1530
60,2,1530
65,2.5,1580
60,3,1590
60,3,1590
47,1,1595
57,2,1630
65,2,1640
33,1,1650
56,2.5,1660
69,3,1690
61,2.5,1690
60,3,1690
47,1.5,1700
49,2,1700
60,2.5,1700
72,3,1715
70,2.5,1730
59,2,1750
30,1,1750
39,2,1750
25,1,1780
68,3,1790
63,3,1790
78,2,1790
50,2.5,1800
70,3,1810
75,2.5,1830
60,3,1840
70,3.5,1840
28,1,1850
62,2.5,1860
90,3,1870
78,3.5,1920
80,2,1930
72,3,1940
78,3.5,1950
62,2.5,1980
80,3.5,1990
80,3.5,2000
75,4,2030
68,3,2040
76,3.5,2060
81,3,2080
92,3,2085
75,3,2090
82,3.5,2130
80,3.5,2130
95,4,2145
85,4.5,2160
58,2.5,2160
33,1.5,2170
94,4.5,2190
100,3.5,2250
77,3.5,2270
80,3.5,2270
80,3.5,2290
92,3.5,2320
92,3.5,2335
99,4.5,2335
98,3,2358
90,4.5,2360
96,3,2380
92,4.5,2380
86,4.5,2390
73,3,2400
80,3.5,2400
96,3,2403
72,3.5,2410
91,4,2420
53,2.5,2425
60,2.5,2490
95,3.5,2500
97,4.5,2530
103,4.5,2550
105,5,2550
112,3.5,2550
110,4,2560
107,5,2570
65,1.5,2570
97,4.5,2590
110,4.5,2625
102,4.5,2635
101,4.5,2675
98,3.5,2700
109,4.5,2710
120,5,2710
107,4,2720
125,3.5,2725
120,4.5,2750
108,5,2805
130,5.5,2820
112,4.5,2825
121,4.5,2830
118,4.5,2840
107,4.5,2840
87,3.5,2850
114,4,2850
110,4.5,2870
118,4.5,2875
126,5.5,2900
112,4.5,2915
93,3,2945
80,3.5,2950
116,4.5,3050
145,4,3050
95,4.5,3080
100,3.5,3090
98,3.5,3090
94,4.5,3100
110,4.5,3150
100,4.5,3160
110,2,3180
109,4.5,3220
131,5,3300
133,5,3300
86,2.5,3350
84,3,3400
145,4,3450
113,4.5,3490
130,3.5,3500
108,3,3525
94,4.5,3570
136,4.5,3765
140,4.5,3765
125,3.5,3790
156,5.5,3930
91,3.5,3950
130,4.5,3965
102,4.5,4061
130,4.5,4200
142,5.5,4260
88,3.5,4310
178,6.5,4760
150,4.5,4800
185,5.5,4900
164,6.5,5160
214,5.5,5200
191,6,5229
156,5.5,5250
145,5,5383
175,5,5460
150,4.5,5500
129,5.5,5560
160,4,5775
201,5.5,6200
240,6.5,6700
145,5,7383
1 living_area nb_rooms rent_price
2 69 3 1810
3 95 3.5 2945
4 21 1.5 685
5 20 1 720
6 33 1.5 830
7 13 1 850
8 17 1 850
9 27 1 855
10 32 1 875
11 26 1.5 890
12 25 1 890
13 31 1 900
14 18 1 900
15 24 1 900
16 25 1 920
17 25 1 930
18 27 1 950
19 37 2 955
20 28 1 960
21 39 1 970
22 31 1 980
23 25 1 980
24 25 1 980
25 29 1.5 1000
26 38 1 1015
27 16 1 1040
28 20 1 1060
29 50 2 1100
30 37 1.5 1130
31 33 2 1160
32 40 1 1200
33 46 1 1210
34 45 2 1235
35 19 1 1250
36 45 2 1310
37 56 2 1315
38 60 2.5 1320
39 23 1 1350
40 49 2 1370
41 51 1.5 1385
42 48 2 1390
43 51 1 1400
44 41 2 1400
45 47 2 1410
46 45 2 1410
47 47 2 1415
48 37 1.5 1420
49 52 2 1445
50 45 2 1450
51 43 2 1450
52 26 1.5 1470
53 49 2 1480
54 49 2.5 1490
55 39 1 1530
56 60 2 1530
57 65 2.5 1580
58 60 3 1590
59 60 3 1590
60 47 1 1595
61 57 2 1630
62 65 2 1640
63 33 1 1650
64 56 2.5 1660
65 69 3 1690
66 61 2.5 1690
67 60 3 1690
68 47 1.5 1700
69 49 2 1700
70 60 2.5 1700
71 72 3 1715
72 70 2.5 1730
73 59 2 1750
74 30 1 1750
75 39 2 1750
76 25 1 1780
77 68 3 1790
78 63 3 1790
79 78 2 1790
80 50 2.5 1800
81 70 3 1810
82 75 2.5 1830
83 60 3 1840
84 70 3.5 1840
85 28 1 1850
86 62 2.5 1860
87 90 3 1870
88 78 3.5 1920
89 80 2 1930
90 72 3 1940
91 78 3.5 1950
92 62 2.5 1980
93 80 3.5 1990
94 80 3.5 2000
95 75 4 2030
96 68 3 2040
97 76 3.5 2060
98 81 3 2080
99 92 3 2085
100 75 3 2090
101 82 3.5 2130
102 80 3.5 2130
103 95 4 2145
104 85 4.5 2160
105 58 2.5 2160
106 33 1.5 2170
107 94 4.5 2190
108 100 3.5 2250
109 77 3.5 2270
110 80 3.5 2270
111 80 3.5 2290
112 92 3.5 2320
113 92 3.5 2335
114 99 4.5 2335
115 98 3 2358
116 90 4.5 2360
117 96 3 2380
118 92 4.5 2380
119 86 4.5 2390
120 73 3 2400
121 80 3.5 2400
122 96 3 2403
123 72 3.5 2410
124 91 4 2420
125 53 2.5 2425
126 60 2.5 2490
127 95 3.5 2500
128 97 4.5 2530
129 103 4.5 2550
130 105 5 2550
131 112 3.5 2550
132 110 4 2560
133 107 5 2570
134 65 1.5 2570
135 97 4.5 2590
136 110 4.5 2625
137 102 4.5 2635
138 101 4.5 2675
139 98 3.5 2700
140 109 4.5 2710
141 120 5 2710
142 107 4 2720
143 125 3.5 2725
144 120 4.5 2750
145 108 5 2805
146 130 5.5 2820
147 112 4.5 2825
148 121 4.5 2830
149 118 4.5 2840
150 107 4.5 2840
151 87 3.5 2850
152 114 4 2850
153 110 4.5 2870
154 118 4.5 2875
155 126 5.5 2900
156 112 4.5 2915
157 93 3 2945
158 80 3.5 2950
159 116 4.5 3050
160 145 4 3050
161 95 4.5 3080
162 100 3.5 3090
163 98 3.5 3090
164 94 4.5 3100
165 110 4.5 3150
166 100 4.5 3160
167 110 2 3180
168 109 4.5 3220
169 131 5 3300
170 133 5 3300
171 86 2.5 3350
172 84 3 3400
173 145 4 3450
174 113 4.5 3490
175 130 3.5 3500
176 108 3 3525
177 94 4.5 3570
178 136 4.5 3765
179 140 4.5 3765
180 125 3.5 3790
181 156 5.5 3930
182 91 3.5 3950
183 130 4.5 3965
184 102 4.5 4061
185 130 4.5 4200
186 142 5.5 4260
187 88 3.5 4310
188 178 6.5 4760
189 150 4.5 4800
190 185 5.5 4900
191 164 6.5 5160
192 214 5.5 5200
193 191 6 5229
194 156 5.5 5250
195 145 5 5383
196 175 5 5460
197 150 4.5 5500
198 129 5.5 5560
199 160 4 5775
200 201 5.5 6200
201 240 6.5 6700
202 145 5 7383

File diff suppressed because one or more lines are too long