Started PW-2
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
MLvenv/**
1053
PW-2/ex1-numpy/numpy-tutorial-stud.ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
PW-2/ex1-numpy/sponge-bob.jpg
Normal file
Binary file not shown.
Size: 59 KiB
481
PW-2/ex2-to-loan-or-not-to-loan/loandata.csv
Normal file
@@ -0,0 +1,481 @@
Gender,Married,Education,TotalIncome,LoanAmount,CreditHistory,LoanStatus
|
||||
Male,Yes,Graduate,6091.0,128.0,1.0,N
|
||||
Male,Yes,Graduate,3000.0,66.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4941.0,120.0,1.0,Y
|
||||
Male,No,Graduate,6000.0,141.0,1.0,Y
|
||||
Male,Yes,Graduate,9613.0,267.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3849.0,95.0,1.0,Y
|
||||
Male,Yes,Graduate,5540.0,158.0,0.0,N
|
||||
Male,Yes,Graduate,5532.0,168.0,1.0,Y
|
||||
Male,Yes,Graduate,23809.0,349.0,1.0,N
|
||||
Male,Yes,Graduate,3900.0,70.0,1.0,Y
|
||||
Male,Yes,Graduate,11179.0,200.0,1.0,Y
|
||||
Male,No,Graduate,4693.0,114.0,1.0,N
|
||||
Male,Yes,Graduate,2385.0,17.0,1.0,Y
|
||||
Male,No,Graduate,4950.0,125.0,1.0,Y
|
||||
Female,No,Graduate,3510.0,76.0,0.0,N
|
||||
Male,Yes,Not Graduate,4887.0,133.0,1.0,N
|
||||
Male,Yes,Not Graduate,7660.0,104.0,0.0,N
|
||||
Male,Yes,Graduate,11580.0,315.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4511.0,116.0,0.0,N
|
||||
Male,Yes,Graduate,9560.0,191.0,1.0,Y
|
||||
Male,Yes,Graduate,5052.0,122.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5266.0,110.0,1.0,Y
|
||||
Male,No,Not Graduate,1442.0,35.0,1.0,N
|
||||
Male,No,Graduate,3167.0,74.0,1.0,N
|
||||
Male,No,Graduate,4692.0,106.0,1.0,N
|
||||
Male,Yes,Graduate,5167.0,114.0,1.0,Y
|
||||
Male,No,Graduate,15500.0,320.0,1.0,N
|
||||
Female,Yes,Graduate,5126.0,144.0,1.0,Y
|
||||
Male,No,Graduate,11376.0,184.0,1.0,Y
|
||||
Male,No,Not Graduate,5416.0,110.0,1.0,Y
|
||||
Male,No,Graduate,3600.0,80.0,1.0,N
|
||||
Male,No,Graduate,3013.0,47.0,1.0,Y
|
||||
Male,Yes,Graduate,6277.0,134.0,1.0,Y
|
||||
Male,Yes,Graduate,5649.0,44.0,1.0,Y
|
||||
Male,Yes,Graduate,5821.0,144.0,1.0,Y
|
||||
Female,Yes,Graduate,6085.0,120.0,0.0,N
|
||||
Female,No,Graduate,6275.0,144.0,1.0,Y
|
||||
Female,Yes,Not Graduate,3572.0,100.0,1.0,Y
|
||||
Female,No,Graduate,3086.0,120.0,1.0,Y
|
||||
Female,No,Graduate,4230.0,112.0,1.0,N
|
||||
Male,Yes,Graduate,4616.0,134.0,1.0,N
|
||||
Female,Yes,Graduate,11500.0,286.0,0.0,N
|
||||
Male,Yes,Graduate,3875.0,97.0,1.0,Y
|
||||
Male,Yes,Graduate,3723.0,96.0,1.0,Y
|
||||
Male,Yes,Graduate,5566.0,135.0,1.0,N
|
||||
Male,Yes,Graduate,10330.0,180.0,1.0,Y
|
||||
Male,Yes,Not Graduate,6216.0,144.0,1.0,Y
|
||||
Male,Yes,Graduate,6296.0,120.0,1.0,Y
|
||||
Male,Yes,Graduate,3029.0,99.0,1.0,Y
|
||||
Male,Yes,Not Graduate,6058.0,165.0,0.0,N
|
||||
Female,No,Graduate,4166.0,116.0,0.0,N
|
||||
Male,Yes,Graduate,10321.0,258.0,1.0,N
|
||||
Male,No,Not Graduate,5454.0,126.0,0.0,N
|
||||
Male,Yes,Graduate,10750.0,312.0,1.0,Y
|
||||
Male,Yes,Not Graduate,7100.0,125.0,1.0,Y
|
||||
Female,No,Graduate,4300.0,136.0,0.0,N
|
||||
Male,Yes,Graduate,6274.0,172.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3750.0,97.0,1.0,Y
|
||||
Male,No,Graduate,3500.0,81.0,1.0,Y
|
||||
Male,Yes,Graduate,7040.0,187.0,1.0,Y
|
||||
Male,No,Graduate,3750.0,113.0,1.0,N
|
||||
Male,No,Graduate,8500.0,176.0,1.0,N
|
||||
Male,Yes,Graduate,4022.0,110.0,1.0,N
|
||||
Male,Yes,Graduate,7167.0,180.0,0.0,N
|
||||
Female,No,Graduate,3846.0,111.0,1.0,Y
|
||||
Female,Yes,Graduate,3259.0,167.0,1.0,N
|
||||
Male,Yes,Graduate,3988.0,50.0,1.0,Y
|
||||
Male,No,Graduate,4897.0,136.0,1.0,Y
|
||||
Male,Yes,Graduate,4618.0,104.0,1.0,Y
|
||||
Male,No,Graduate,8566.0,210.0,1.0,Y
|
||||
Male,Yes,Graduate,9862.0,175.0,1.0,Y
|
||||
Male,Yes,Graduate,5858.0,131.0,1.0,Y
|
||||
Male,Yes,Graduate,11904.0,188.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5093.0,81.0,1.0,Y
|
||||
Male,No,Graduate,4133.0,122.0,1.0,Y
|
||||
Male,No,Not Graduate,3620.0,25.0,1.0,Y
|
||||
Female,Yes,Graduate,4786.0,137.0,1.0,Y
|
||||
Male,Yes,Graduate,2974.0,50.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4188.0,115.0,1.0,Y
|
||||
Male,Yes,Graduate,5300.0,131.0,1.0,Y
|
||||
Male,Yes,Not Graduate,7551.0,133.0,1.0,Y
|
||||
Male,No,Graduate,8649.0,151.0,1.0,Y
|
||||
Male,Yes,Graduate,4082.0,100.0,1.0,Y
|
||||
Male,Yes,Graduate,12543.0,225.0,1.0,Y
|
||||
Male,Yes,Graduate,7400.0,216.0,0.0,N
|
||||
Male,Yes,Not Graduate,2825.0,94.0,1.0,Y
|
||||
Male,No,Graduate,5316.0,136.0,1.0,Y
|
||||
Male,Yes,Graduate,14583.0,185.0,1.0,Y
|
||||
Female,Yes,Graduate,5450.0,154.0,1.0,Y
|
||||
Male,Yes,Graduate,7710.0,175.0,1.0,N
|
||||
Female,No,Graduate,10408.0,259.0,1.0,Y
|
||||
Female,No,Graduate,4166.0,44.0,1.0,Y
|
||||
Female,No,Graduate,11117.0,137.0,0.0,Y
|
||||
Male,Yes,Graduate,2957.0,81.0,1.0,Y
|
||||
Male,Yes,Not Graduate,6314.0,194.0,1.0,Y
|
||||
Male,Yes,Graduate,14363.0,160.0,0.0,N
|
||||
Male,No,Graduate,3943.0,74.0,1.0,Y
|
||||
Male,No,Graduate,2718.0,70.0,1.0,Y
|
||||
Male,Yes,Graduate,3459.0,25.0,1.0,Y
|
||||
Male,No,Graduate,4895.0,102.0,1.0,Y
|
||||
Male,Yes,Graduate,11750.0,290.0,1.0,N
|
||||
Female,Yes,Graduate,4583.0,84.0,1.0,N
|
||||
Male,Yes,Graduate,6816.0,88.0,1.0,Y
|
||||
Male,No,Graduate,14999.0,242.0,0.0,N
|
||||
Male,Yes,Not Graduate,5630.0,129.0,1.0,N
|
||||
Male,Yes,Graduate,7125.0,185.0,1.0,N
|
||||
Male,No,Graduate,5417.0,168.0,1.0,Y
|
||||
Male,No,Graduate,6950.0,175.0,1.0,Y
|
||||
Male,Yes,Graduate,4732.0,122.0,1.0,Y
|
||||
Male,Yes,Graduate,11757.0,187.0,1.0,Y
|
||||
Female,Yes,Graduate,6816.0,100.0,1.0,Y
|
||||
Female,Yes,Graduate,14866.0,70.0,1.0,Y
|
||||
Male,Yes,Graduate,2963.0,30.0,1.0,Y
|
||||
Female,No,Graduate,11666.0,225.0,1.0,N
|
||||
Male,Yes,Graduate,5690.0,125.0,1.0,Y
|
||||
Male,No,Graduate,6277.0,118.0,0.0,N
|
||||
Male,Yes,Graduate,6327.0,152.0,1.0,Y
|
||||
Male,No,Graduate,9166.0,244.0,1.0,N
|
||||
Male,Yes,Not Graduate,2281.0,113.0,1.0,N
|
||||
Male,No,Graduate,3254.0,50.0,1.0,Y
|
||||
Male,Yes,Graduate,39999.0,600.0,0.0,Y
|
||||
Male,Yes,Graduate,9538.0,187.0,1.0,Y
|
||||
Male,Yes,Graduate,10208.0,255.0,1.0,Y
|
||||
Male,Yes,Not Graduate,2904.0,98.0,1.0,Y
|
||||
Male,Yes,Graduate,7933.0,275.0,1.0,N
|
||||
Male,Yes,Graduate,4369.0,121.0,0.0,N
|
||||
Male,Yes,Graduate,5614.0,158.0,1.0,Y
|
||||
Male,Yes,Graduate,9323.0,75.0,1.0,Y
|
||||
Female,Yes,Graduate,4583.0,112.0,1.0,N
|
||||
Male,Yes,Graduate,5772.0,129.0,1.0,Y
|
||||
Male,No,Graduate,2237.0,63.0,0.0,N
|
||||
Male,Yes,Graduate,8000.0,200.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3522.0,81.0,1.0,N
|
||||
Male,Yes,Graduate,11333.0,187.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5080.0,87.0,1.0,N
|
||||
Male,Yes,Graduate,5461.0,116.0,1.0,Y
|
||||
Male,Yes,Graduate,3664.0,101.0,1.0,Y
|
||||
Male,Yes,Graduate,16816.0,495.0,0.0,N
|
||||
Male,Yes,Graduate,3750.0,116.0,1.0,Y
|
||||
Male,No,Not Graduate,3784.0,102.0,0.0,N
|
||||
Male,Yes,Graduate,13650.0,180.0,0.0,N
|
||||
Male,Yes,Graduate,4600.0,73.0,1.0,Y
|
||||
Male,Yes,Graduate,33846.0,260.0,1.0,N
|
||||
Female,Yes,Graduate,3625.0,108.0,1.0,Y
|
||||
Male,Yes,Graduate,43897.0,120.0,1.0,Y
|
||||
Male,Yes,Graduate,2178.0,66.0,0.0,N
|
||||
Male,Yes,Graduate,9328.0,188.0,1.0,Y
|
||||
Male,No,Not Graduate,4885.0,48.0,1.0,Y
|
||||
Male,No,Graduate,12000.0,164.0,1.0,N
|
||||
Male,Yes,Not Graduate,6033.0,160.0,1.0,N
|
||||
Male,No,Graduate,3858.0,76.0,1.0,Y
|
||||
Male,No,Graduate,4191.0,120.0,1.0,Y
|
||||
Male,Yes,Graduate,5708.0,170.0,1.0,N
|
||||
Male,No,Graduate,12083.0,187.0,1.0,Y
|
||||
Male,No,Graduate,11000.0,83.0,1.0,N
|
||||
Male,Yes,Not Graduate,5100.0,90.0,1.0,Y
|
||||
Male,No,Graduate,4923.0,166.0,0.0,Y
|
||||
Male,Yes,Not Graduate,4583.0,135.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3917.0,124.0,1.0,Y
|
||||
Female,No,Not Graduate,4408.0,120.0,1.0,Y
|
||||
Female,No,Graduate,3244.0,80.0,1.0,Y
|
||||
Male,No,Not Graduate,6506.0,55.0,1.0,Y
|
||||
Male,No,Graduate,2479.0,59.0,1.0,Y
|
||||
Male,No,Graduate,3418.0,127.0,1.0,N
|
||||
Female,No,Graduate,10000.0,214.0,1.0,N
|
||||
Male,Yes,Graduate,4680.0,128.0,0.0,N
|
||||
Male,Yes,Graduate,7787.0,240.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5703.0,130.0,1.0,Y
|
||||
Male,Yes,Graduate,6194.0,137.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4833.0,100.0,1.0,Y
|
||||
Male,Yes,Graduate,1950.0,135.0,1.0,N
|
||||
Male,Yes,Graduate,5502.0,131.0,1.0,Y
|
||||
Male,Yes,Graduate,2221.0,60.0,0.0,N
|
||||
Male,Yes,Graduate,5726.0,116.0,1.0,Y
|
||||
Male,No,Graduate,5762.0,144.0,1.0,Y
|
||||
Male,Yes,Graduate,6250.0,128.0,1.0,Y
|
||||
Male,Yes,Graduate,3250.0,170.0,1.0,N
|
||||
Male,Yes,Graduate,7945.0,210.0,1.0,Y
|
||||
Male,No,Graduate,6400.0,200.0,1.0,Y
|
||||
Male,Yes,Graduate,4545.0,104.0,1.0,Y
|
||||
Female,No,Graduate,8333.0,280.0,1.0,Y
|
||||
Male,Yes,Graduate,4934.0,140.0,1.0,Y
|
||||
Male,Yes,Graduate,6760.0,170.0,1.0,Y
|
||||
Female,No,Graduate,3812.0,112.0,1.0,Y
|
||||
Male,Yes,Graduate,3315.0,96.0,1.0,Y
|
||||
Male,Yes,Graduate,10819.0,120.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4493.0,140.0,1.0,N
|
||||
Male,No,Graduate,8666.0,155.0,1.0,Y
|
||||
Male,Yes,Graduate,7550.0,108.0,1.0,Y
|
||||
Male,Yes,Not Graduate,7823.0,123.0,1.0,Y
|
||||
Male,No,Graduate,10383.0,120.0,1.0,N
|
||||
Male,Yes,Graduate,9703.0,112.0,1.0,Y
|
||||
Male,Yes,Not Graduate,6608.0,137.0,1.0,Y
|
||||
Male,Yes,Graduate,4725.0,123.0,1.0,Y
|
||||
Male,Yes,Graduate,3677.0,90.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5558.0,201.0,0.0,N
|
||||
Female,No,Graduate,3427.0,138.0,1.0,N
|
||||
Male,No,Not Graduate,4750.0,104.0,1.0,Y
|
||||
Male,Yes,Not Graduate,9762.0,279.0,1.0,Y
|
||||
Male,No,Graduate,16250.0,192.0,0.0,N
|
||||
Female,No,Graduate,3083.0,255.0,1.0,Y
|
||||
Male,No,Not Graduate,6045.0,115.0,0.0,N
|
||||
Male,Yes,Graduate,5250.0,94.0,1.0,N
|
||||
Male,Yes,Graduate,16783.0,304.0,1.0,N
|
||||
Male,No,Graduate,4269.0,134.0,1.0,Y
|
||||
Female,No,Graduate,3481.0,155.0,1.0,N
|
||||
Female,No,Graduate,7200.0,120.0,1.0,Y
|
||||
Male,No,Graduate,5166.0,128.0,1.0,Y
|
||||
Male,No,Graduate,7542.0,151.0,1.0,Y
|
||||
Male,Yes,Graduate,6095.0,150.0,1.0,Y
|
||||
Male,Yes,Graduate,6144.0,160.0,0.0,Y
|
||||
Female,No,Graduate,4436.0,90.0,1.0,Y
|
||||
Female,No,Graduate,3237.0,30.0,1.0,Y
|
||||
Male,Yes,Graduate,11146.0,136.0,1.0,Y
|
||||
Male,No,Graduate,4690.0,126.0,1.0,Y
|
||||
Male,Yes,Graduate,4843.0,150.0,1.0,Y
|
||||
Male,Yes,Graduate,3900.0,90.0,1.0,Y
|
||||
Male,Yes,Graduate,4592.0,115.0,1.0,Y
|
||||
Male,Yes,Graduate,7267.0,207.0,1.0,Y
|
||||
Male,Yes,Graduate,4403.0,80.0,1.0,Y
|
||||
Male,Yes,Graduate,14583.0,436.0,1.0,Y
|
||||
Male,No,Not Graduate,6479.0,158.0,0.0,N
|
||||
Male,Yes,Graduate,4727.0,112.0,1.0,Y
|
||||
Male,Yes,Graduate,3286.7999878,78.0,1.0,Y
|
||||
Female,No,Graduate,3477.0,54.0,1.0,Y
|
||||
Male,No,Graduate,6211.0,89.0,1.0,Y
|
||||
Female,No,Graduate,4317.0,99.0,1.0,N
|
||||
Male,Yes,Graduate,5704.0,120.0,1.0,Y
|
||||
Female,No,Graduate,4124.0,115.0,1.0,Y
|
||||
Male,No,Graduate,9508.0,187.0,1.0,Y
|
||||
Male,Yes,Graduate,5491.0,139.0,1.0,Y
|
||||
Male,Yes,Graduate,4400.0,127.0,0.0,N
|
||||
Male,Yes,Graduate,4713.0,134.0,1.0,Y
|
||||
Male,Yes,Graduate,5717.0,172.0,1.0,Y
|
||||
Male,Yes,Graduate,6875.0,200.0,1.0,Y
|
||||
Female,Yes,Graduate,4666.0,135.0,1.0,Y
|
||||
Female,No,Graduate,7541.0,151.0,1.0,N
|
||||
Male,Yes,Graduate,4939.0,113.0,1.0,N
|
||||
Male,Yes,Not Graduate,4734.0,93.0,0.0,N
|
||||
Female,No,Graduate,5000.0,132.0,1.0,Y
|
||||
Male,Yes,Graduate,3428.0,96.0,1.0,Y
|
||||
Male,No,Graduate,6500.0,140.0,1.0,Y
|
||||
Female,No,Graduate,5428.0,135.0,1.0,Y
|
||||
Female,No,Graduate,4263.0,104.0,0.0,N
|
||||
Male,No,Graduate,20233.0,480.0,1.0,N
|
||||
Female,No,Graduate,2917.0,84.0,1.0,Y
|
||||
Male,No,Not Graduate,5332.0,111.0,1.0,Y
|
||||
Female,No,Graduate,2507.0,56.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5039.0,111.0,1.0,Y
|
||||
Male,Yes,Graduate,3717.0,120.0,1.0,Y
|
||||
Male,Yes,Graduate,10000.0,155.0,1.0,N
|
||||
Male,Yes,Graduate,4567.0,115.0,1.0,Y
|
||||
Male,Yes,Graduate,4531.0,124.0,1.0,Y
|
||||
Male,Yes,Graduate,15000.0,300.0,1.0,Y
|
||||
Male,Yes,Graduate,13649.0,376.0,0.0,N
|
||||
Male,No,Graduate,4917.0,130.0,0.0,Y
|
||||
Male,Yes,Graduate,7978.0,184.0,1.0,Y
|
||||
Female,Yes,Graduate,6784.0,110.0,1.0,N
|
||||
Female,No,Graduate,2500.0,67.0,1.0,Y
|
||||
Male,No,Graduate,6177.0,117.0,1.0,Y
|
||||
Male,No,Graduate,2935.0,98.0,1.0,Y
|
||||
Male,Yes,Graduate,7100.0,176.0,1.0,Y
|
||||
Female,No,Graduate,4160.0,71.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4234.0,173.0,1.0,N
|
||||
Female,No,Graduate,2378.0,46.0,1.0,N
|
||||
Male,Yes,Not Graduate,5783.0,158.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3173.0,74.0,1.0,Y
|
||||
Male,Yes,Graduate,4957.0,160.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5251.0,126.0,1.0,Y
|
||||
Male,Yes,Graduate,8875.0,187.0,1.0,Y
|
||||
Male,Yes,Graduate,9083.0,228.0,1.0,Y
|
||||
Male,No,Graduate,12917.0,308.0,1.0,N
|
||||
Male,Yes,Graduate,4749.0,95.0,1.0,Y
|
||||
Female,Yes,Graduate,5500.0,105.0,0.0,N
|
||||
Female,Yes,Graduate,2928.0,130.0,1.0,Y
|
||||
Male,Yes,Graduate,11500.0,165.0,1.0,Y
|
||||
Male,Yes,Graduate,3875.0,67.0,1.0,N
|
||||
Male,Yes,Not Graduate,4666.0,100.0,0.0,N
|
||||
Male,Yes,Graduate,8334.0,200.0,1.0,Y
|
||||
Female,No,Graduate,4723.0,81.0,1.0,N
|
||||
Male,Yes,Graduate,8667.0,236.0,1.0,Y
|
||||
Male,Yes,Graduate,7083.0,130.0,1.0,Y
|
||||
Male,No,Graduate,6822.0,141.0,1.0,Y
|
||||
Male,No,Not Graduate,6216.0,133.0,1.0,N
|
||||
Male,No,Graduate,2500.0,96.0,1.0,N
|
||||
Male,Yes,Graduate,6325.0,175.0,1.0,Y
|
||||
Male,Yes,Graduate,24996.0,570.0,1.0,N
|
||||
Female,No,Graduate,15759.0,55.0,1.0,Y
|
||||
Male,Yes,Graduate,5185.0,155.0,1.0,Y
|
||||
Male,Yes,Graduate,17196.0,380.0,1.0,Y
|
||||
Male,No,Graduate,5049.0,111.0,0.0,N
|
||||
Male,Yes,Graduate,5740.0,120.0,1.0,Y
|
||||
Male,Yes,Graduate,13746.0,130.0,1.0,Y
|
||||
Male,No,Graduate,3069.0,71.0,1.0,N
|
||||
Male,Yes,Graduate,5391.0,130.0,1.0,Y
|
||||
Male,No,Graduate,10173.0,296.0,1.0,Y
|
||||
Female,No,Graduate,6000.0,156.0,1.0,Y
|
||||
Male,No,Graduate,7167.0,128.0,1.0,Y
|
||||
Male,Yes,Graduate,4566.0,100.0,1.0,N
|
||||
Male,No,Not Graduate,3946.0,132.0,1.0,Y
|
||||
Male,Yes,Graduate,4750.0,136.0,1.0,Y
|
||||
Male,Yes,Graduate,5488.0,125.0,1.0,Y
|
||||
Male,No,Graduate,9167.0,185.0,1.0,Y
|
||||
Male,Yes,Graduate,9504.0,275.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3618.0,113.0,1.0,Y
|
||||
Male,Yes,Graduate,4500.0,113.0,1.0,Y
|
||||
Female,No,Graduate,3180.0,71.0,0.0,N
|
||||
Male,Yes,Graduate,4492.0,95.0,1.0,Y
|
||||
Male,No,Not Graduate,5568.0,109.0,1.0,Y
|
||||
Female,No,Graduate,3300.0,103.0,0.0,N
|
||||
Male,Yes,Not Graduate,2889.0,45.0,0.0,N
|
||||
Male,No,Not Graduate,2755.0,65.0,1.0,N
|
||||
Male,No,Graduate,22500.0,103.0,1.0,Y
|
||||
Female,No,Not Graduate,1963.0,53.0,1.0,Y
|
||||
Female,No,Graduate,7441.0,194.0,1.0,N
|
||||
Female,No,Graduate,4547.0,115.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4567.0,115.0,1.0,Y
|
||||
Female,No,Not Graduate,2213.0,66.0,1.0,Y
|
||||
Male,Yes,Graduate,8300.0,152.0,0.0,N
|
||||
Male,Yes,Graduate,81000.0,360.0,0.0,N
|
||||
Female,No,Not Graduate,3867.0,62.0,1.0,N
|
||||
Male,Yes,Not Graduate,6096.0,218.0,0.0,N
|
||||
Male,Yes,Not Graduate,4286.0,110.0,1.0,Y
|
||||
Female,Yes,Not Graduate,5386.0,178.0,0.0,N
|
||||
Female,No,Graduate,2995.0,60.0,1.0,Y
|
||||
Female,No,Graduate,2600.0,160.0,1.0,N
|
||||
Male,Yes,Graduate,21600.0,239.0,1.0,N
|
||||
Male,Yes,Graduate,3798.0,112.0,1.0,Y
|
||||
Male,Yes,Graduate,4663.0,138.0,1.0,Y
|
||||
Male,Yes,Graduate,5829.0,138.0,1.0,Y
|
||||
Male,Yes,Graduate,3539.0,100.0,1.0,Y
|
||||
Male,Yes,Graduate,14880.0,96.0,1.0,Y
|
||||
Male,Yes,Graduate,6966.0,121.0,1.0,Y
|
||||
Female,No,Not Graduate,4606.0,81.0,1.0,N
|
||||
Male,Yes,Graduate,5935.0,133.0,1.0,Y
|
||||
Male,Yes,Graduate,2936.12000084,87.0,1.0,Y
|
||||
Male,No,Not Graduate,2717.0,60.0,1.0,Y
|
||||
Female,No,Graduate,8624.0,150.0,1.0,Y
|
||||
Male,No,Graduate,6500.0,105.0,0.0,N
|
||||
Male,Yes,Graduate,4765.0,143.0,1.0,Y
|
||||
Male,No,Graduate,3750.0,100.0,1.0,Y
|
||||
Male,No,Graduate,3777.0,50.0,1.0,Y
|
||||
Male,No,Graduate,10416.0,187.0,0.0,N
|
||||
Female,Yes,Not Graduate,7142.0,138.0,1.0,Y
|
||||
Male,No,Graduate,8724.0,187.0,1.0,Y
|
||||
Male,Yes,Graduate,9734.0,180.0,1.0,Y
|
||||
Male,No,Not Graduate,6700.0,148.0,1.0,Y
|
||||
Male,No,Graduate,37719.0,152.0,1.0,Y
|
||||
Male,Yes,Graduate,4676.0,130.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4652.0,110.0,1.0,Y
|
||||
Male,Yes,Graduate,5050.0,150.0,0.0,N
|
||||
Male,Yes,Not Graduate,3564.0,125.0,0.0,N
|
||||
Male,Yes,Graduate,5681.0,149.0,0.0,N
|
||||
Male,Yes,Graduate,4949.0,90.0,0.0,Y
|
||||
Male,No,Graduate,7085.0,84.0,1.0,Y
|
||||
Male,Yes,Graduate,3859.0,96.0,1.0,Y
|
||||
Male,Yes,Graduate,4301.0,118.0,1.0,Y
|
||||
Male,Yes,Graduate,6277.0,173.0,1.0,N
|
||||
Male,No,Graduate,4354.0,136.0,1.0,Y
|
||||
Male,Yes,Graduate,8334.0,160.0,1.0,N
|
||||
Male,Yes,Graduate,7740.0,128.0,1.0,Y
|
||||
Male,Yes,Graduate,5203.0,153.0,1.0,Y
|
||||
Male,No,Graduate,4166.0,98.0,0.0,N
|
||||
Male,No,Graduate,6000.0,140.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4611.0,70.0,0.0,N
|
||||
Male,Yes,Graduate,6784.0,110.0,1.0,N
|
||||
Male,Yes,Graduate,5529.0,162.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4153.0,113.0,0.0,N
|
||||
Male,Yes,Graduate,4691.0,100.0,1.0,Y
|
||||
Male,No,Graduate,10180.0,162.0,1.0,Y
|
||||
Male,Yes,Graduate,17539.0,150.0,1.0,Y
|
||||
Male,Yes,Graduate,8450.0,230.0,1.0,Y
|
||||
Male,Yes,Graduate,18917.0,86.0,1.0,Y
|
||||
Female,No,Not Graduate,4350.0,154.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3095.0,113.0,1.0,Y
|
||||
Male,Yes,Graduate,5233.0,128.0,1.0,Y
|
||||
Male,Yes,Graduate,10833.0,234.0,1.0,Y
|
||||
Male,Yes,Graduate,8333.0,246.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4394.0,131.0,1.0,Y
|
||||
Male,No,Graduate,3547.0,80.0,0.0,N
|
||||
Male,Yes,Graduate,18333.0,500.0,1.0,N
|
||||
Male,Yes,Graduate,6666.0,160.0,1.0,Y
|
||||
Male,No,Graduate,2435.0,75.0,1.0,N
|
||||
Male,No,Not Graduate,3691.0,110.0,1.0,Y
|
||||
Female,No,Not Graduate,17263.0,225.0,1.0,Y
|
||||
Male,Yes,Graduate,5754.0,119.0,0.0,N
|
||||
Female,Yes,Graduate,4239.0,105.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4300.0,107.0,1.0,Y
|
||||
Male,Yes,Graduate,2895.0,95.0,1.0,Y
|
||||
Male,No,Graduate,10699.0,209.0,0.0,N
|
||||
Female,No,Graduate,4328.0,113.0,1.0,Y
|
||||
Female,No,Graduate,3159.0,100.0,1.0,Y
|
||||
Male,Yes,Graduate,10489.0,208.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5297.0,124.0,1.0,Y
|
||||
Male,Yes,Graduate,7926.0,243.0,1.0,Y
|
||||
Male,Yes,Graduate,5492.0,188.0,1.0,Y
|
||||
Female,No,Graduate,13262.0,40.0,1.0,Y
|
||||
Male,No,Not Graduate,4885.0,100.0,1.0,N
|
||||
Male,Yes,Graduate,8069.0,250.0,1.0,Y
|
||||
Male,Yes,Graduate,5318.0,148.0,1.0,Y
|
||||
Male,Yes,Graduate,8796.0,70.0,1.0,N
|
||||
Male,No,Graduate,9481.0,311.0,1.0,N
|
||||
Male,Yes,Graduate,6894.0,150.0,1.0,Y
|
||||
Female,Yes,Graduate,3663.0,113.0,1.0,Y
|
||||
Male,No,Graduate,6598.0,185.0,1.0,N
|
||||
Female,No,Not Graduate,3400.0,95.0,1.0,N
|
||||
Male,Yes,Not Graduate,3934.0,45.0,1.0,Y
|
||||
Male,No,Graduate,2500.0,55.0,1.0,Y
|
||||
Male,Yes,Graduate,7101.0,100.0,1.0,Y
|
||||
Male,Yes,Graduate,15114.0,480.0,1.0,Y
|
||||
Male,Yes,Graduate,17500.0,400.0,1.0,Y
|
||||
Male,Yes,Graduate,3775.0,110.0,1.0,Y
|
||||
Male,Yes,Not Graduate,6715.0,161.0,0.0,Y
|
||||
Male,No,Not Graduate,3981.0,94.0,1.0,Y
|
||||
Male,No,Not Graduate,6783.0,130.0,1.0,Y
|
||||
Male,Yes,Graduate,4281.0,100.0,1.0,Y
|
||||
Male,No,Graduate,3588.0,110.0,0.0,N
|
||||
Female,No,Not Graduate,18165.0,125.0,1.0,Y
|
||||
Male,Yes,Graduate,10039.0,324.0,1.0,Y
|
||||
Male,No,Graduate,3617.0,107.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3453.0,66.0,1.0,N
|
||||
Male,Yes,Graduate,6417.0,157.0,1.0,Y
|
||||
Female,Yes,Graduate,7453.0,140.0,1.0,Y
|
||||
Female,No,Graduate,2138.0,99.0,0.0,N
|
||||
Male,Yes,Not Graduate,4763.0,128.0,1.0,Y
|
||||
Male,Yes,Graduate,4718.0,155.0,1.0,Y
|
||||
Male,No,Not Graduate,3358.0,80.0,1.0,N
|
||||
Male,No,Graduate,4309.0,145.0,1.0,Y
|
||||
Female,No,Graduate,5000.0,103.0,0.0,N
|
||||
Male,Yes,Graduate,4801.0,110.0,1.0,Y
|
||||
Male,Yes,Graduate,6583.0,158.0,1.0,Y
|
||||
Male,Yes,Not Graduate,4787.0,181.0,0.0,N
|
||||
Male,Yes,Graduate,7859.0,132.0,0.0,N
|
||||
Male,Yes,Graduate,6500.0,26.0,1.0,Y
|
||||
Male,Yes,Graduate,10139.0,260.0,1.0,Y
|
||||
Male,Yes,Graduate,6556.0,162.0,1.0,Y
|
||||
Female,Yes,Graduate,6486.0,182.0,1.0,Y
|
||||
Male,Yes,Not Graduate,3917.0,108.0,1.0,Y
|
||||
Female,Yes,Graduate,19484.0,600.0,1.0,Y
|
||||
Male,Yes,Graduate,7977.0,211.0,1.0,Y
|
||||
Male,No,Not Graduate,5800.0,132.0,1.0,Y
|
||||
Male,Yes,Graduate,8799.0,258.0,0.0,N
|
||||
Male,No,Graduate,3333.0,70.0,1.0,Y
|
||||
Male,Yes,Graduate,5900.0,123.0,0.0,N
|
||||
Female,No,Graduate,2378.0,9.0,1.0,N
|
||||
Male,Yes,Graduate,5230.0,104.0,0.0,N
|
||||
Male,Yes,Graduate,5167.0,186.0,1.0,Y
|
||||
Male,Yes,Graduate,16666.0,275.0,1.0,Y
|
||||
Male,Yes,Not Graduate,7750.0,187.0,1.0,N
|
||||
Male,Yes,Graduate,6406.0,150.0,1.0,N
|
||||
Male,Yes,Graduate,3620.0,108.0,1.0,Y
|
||||
Male,No,Graduate,5968.0,110.0,1.0,Y
|
||||
Male,Yes,Graduate,4014.0,107.0,1.0,Y
|
||||
Male,Yes,Graduate,6540.0,205.0,1.0,Y
|
||||
Male,No,Graduate,35673.0,90.0,1.0,N
|
||||
Female,Yes,Graduate,3166.0,36.0,1.0,Y
|
||||
Male,Yes,Graduate,4704.0,146.0,0.0,N
|
||||
Male,Yes,Graduate,7283.0,172.0,1.0,N
|
||||
Male,Yes,Graduate,3819.0,104.0,1.0,Y
|
||||
Female,No,Not Graduate,2165.0,70.0,1.0,Y
|
||||
Male,Yes,Graduate,2726.0,106.0,0.0,N
|
||||
Male,Yes,Graduate,6416.0,56.0,1.0,Y
|
||||
Male,Yes,Graduate,6000.0,205.0,1.0,N
|
||||
Male,Yes,Graduate,7159.0,142.0,1.0,Y
|
||||
Male,Yes,Graduate,16120.0,260.0,1.0,Y
|
||||
Male,No,Not Graduate,3833.0,110.0,1.0,Y
|
||||
Male,Yes,Not Graduate,7383.0,187.0,1.0,N
|
||||
Male,Yes,Graduate,9963.0,180.0,1.0,Y
|
||||
Male,Yes,Graduate,5780.0,192.0,1.0,Y
|
||||
Male,Yes,Graduate,5703.0,128.0,1.0,Y
|
||||
Male,No,Graduate,7977.0,172.0,1.0,Y
|
||||
Female,Yes,Graduate,12000.0,496.0,1.0,Y
|
||||
Male,Yes,Graduate,5900.0,173.0,1.0,Y
|
||||
Male,Yes,Not Graduate,5398.0,157.0,1.0,Y
|
||||
Male,Yes,Graduate,5182.0,108.0,1.0,Y
|
||||
Female,No,Graduate,2900.0,71.0,1.0,Y
|
||||
Male,Yes,Graduate,4106.0,40.0,1.0,Y
|
||||
Male,Yes,Graduate,8312.0,253.0,1.0,Y
|
||||
Male,Yes,Graduate,7583.0,187.0,1.0,Y
|
||||
Female,No,Graduate,4583.0,133.0,0.0,N
@@ -0,0 +1,600 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5da8da61",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Exercice 2: Classification system with KNN - To Loan or Not To Loan"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9669e493",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "22bbd869",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Import some useful libraries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "26758936",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.preprocessing import OrdinalEncoder, StandardScaler\n",
|
||||
"from sklearn.model_selection import train_test_split"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "abc131ca",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## a. Getting started"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "45b518e5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Data loading"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1ef061f2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The original dataset comes from the Kaggle's [Loan Prediction](https://www.kaggle.com/ninzaami/loan-predication) problem. The provided dataset has already undergone some processing, such as removing some columns and invalid data. Pandas is used to read the CSV file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "a23f62b5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = pd.read_csv(\"loandata.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "02ca77c7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Display the head of the data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "f4bec500",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Gender</th>\n",
|
||||
" <th>Married</th>\n",
|
||||
" <th>Education</th>\n",
|
||||
" <th>TotalIncome</th>\n",
|
||||
" <th>LoanAmount</th>\n",
|
||||
" <th>CreditHistory</th>\n",
|
||||
" <th>LoanStatus</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Graduate</td>\n",
|
||||
" <td>6091.0</td>\n",
|
||||
" <td>128.0</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Graduate</td>\n",
|
||||
" <td>3000.0</td>\n",
|
||||
" <td>66.0</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>Y</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Not Graduate</td>\n",
|
||||
" <td>4941.0</td>\n",
|
||||
" <td>120.0</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>Y</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Graduate</td>\n",
|
||||
" <td>6000.0</td>\n",
|
||||
" <td>141.0</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>Y</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>Male</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Graduate</td>\n",
|
||||
" <td>9613.0</td>\n",
|
||||
" <td>267.0</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>Y</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Gender Married Education TotalIncome LoanAmount CreditHistory \\\n",
|
||||
"0 Male Yes Graduate 6091.0 128.0 1.0 \n",
|
||||
"1 Male Yes Graduate 3000.0 66.0 1.0 \n",
|
||||
"2 Male Yes Not Graduate 4941.0 120.0 1.0 \n",
|
||||
"3 Male No Graduate 6000.0 141.0 1.0 \n",
|
||||
"4 Male Yes Graduate 9613.0 267.0 1.0 \n",
|
||||
"\n",
|
||||
" LoanStatus \n",
|
||||
"0 N \n",
|
||||
"1 Y \n",
|
||||
"2 Y \n",
|
||||
"3 Y \n",
|
||||
"4 Y "
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e271b475",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Data's columns:\n",
|
||||
"* **Gender:** Applicant gender (Male/ Female)\n",
|
||||
"* **Married:** Is the Applicant married? (Y/N)\n",
|
||||
"* **Education:** Applicant Education (Graduate/ Not Graduate)\n",
|
||||
"* **TotalIncome:** Applicant total income (sum of `ApplicantIncome` and `CoapplicantIncome` columns in the original dataset)\n",
|
||||
"* **LoanAmount:** Loan amount in thousands\n",
|
||||
"* **CreditHistory:** Credit history meets guidelines\n",
|
||||
"* **LoanStatus** (Target)**:** Loan approved (Y/N)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "702ce4e6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Data preprocessing"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7fce724c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Define a list of categorical columns to encode."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "2c56efa5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"categorical_columns = [\"Gender\", \"Married\", \"Education\", \"LoanStatus\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d8915a68",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Encode categorical columns using the [`OrdinalEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) of scikit learn."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "dc5f9cda",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data[categorical_columns] = OrdinalEncoder().fit_transform(data[categorical_columns])"
|
||||
]
|
||||
},
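As an aside (not part of the original notebook), here is a minimal sketch of what `OrdinalEncoder` does on a toy frame; the column names reuse two columns of the dataset, and the mapping shown is the standard scikit-learn behaviour of assigning 0, 1, ... to the categories in sorted order:

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Toy example: categories are encoded in sorted order, so
# Female -> 0.0, Male -> 1.0 and No -> 0.0, Yes -> 1.0.
toy = pd.DataFrame({"Gender": ["Male", "Female", "Male"],
                    "Married": ["Yes", "No", "No"]})
enc = OrdinalEncoder()
print(enc.fit_transform(toy))
# [[1. 1.]
#  [0. 0.]
#  [1. 0.]]
print(enc.categories_)
```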
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "df9c84b4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Split into `X` and `y`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "83beacfb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X = data.drop(columns=\"LoanStatus\")\n",
|
||||
"y = data.LoanStatus"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e25c8f24",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Normalize data using the [`StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) of scikit learn."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "9c567bb7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X[X.columns] = StandardScaler().fit_transform(X[X.columns])"
|
||||
]
|
||||
},
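For reference (an illustrative aside, not part of the exercise), `StandardScaler` standardizes each column to zero mean and unit variance, z = (x - mean) / std; a quick manual check on a toy column:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Manual standardization of a single toy column versus StandardScaler.
x = np.array([[1.0], [2.0], [3.0], [4.0]])
z_manual = (x - x.mean(axis=0)) / x.std(axis=0)
z_scaler = StandardScaler().fit_transform(x)
print(np.allclose(z_manual, z_scaler))  # True
```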
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7437ea21",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Convert `y` type to `int` "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "c0db7c1f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y = y.astype(int)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6d1d1f10",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Split dataset into train and test sets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "b05be2cc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)"
|
||||
]
|
||||
},
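An optional sanity check (not required by the worksheet): with `stratify=y` the class balance of the target is preserved in both splits, so the three proportions printed below should be nearly identical.

```python
# Proportion of approved loans overall, in the training split and in the test split.
print(y.mean(), y_train.mean(), y_test.mean())
```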
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8f6d3ce6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## b. Dummy classifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "80ec4058",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Build a dummy classifier that takes decisions randomly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "30919672",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class DummyClassifier():\n",
|
||||
" \n",
|
||||
" def __init__(self):\n",
|
||||
" \"\"\"\n",
|
||||
" Initialize the class.\n",
|
||||
" \"\"\"\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" def fit(self, X, y):\n",
|
||||
" \"\"\"\n",
|
||||
" Fit the dummy classifier.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" X : Numpy array or Pandas DataFrame of shape (n_samples, n_features)\n",
|
||||
" Training data.\n",
|
||||
" y : Numpy array or Pandas DataFrame of shape (n_samples,)\n",
|
||||
" Target values.\n",
|
||||
" \"\"\"\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" def predict(self, X):\n",
|
||||
" \"\"\"\n",
|
||||
" Predict the class labels for the provided data.\n",
|
||||
"\n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" X : Numpy array or Pandas DataFrame of shape (n_queries, n_features)\n",
|
||||
" Test samples.\n",
|
||||
"\n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" y : Numpy array or Pandas DataFrame of shape (n_queries,)\n",
|
||||
" Class labels for each data sample.\n",
|
||||
" \"\"\"\n",
|
||||
" pass"
|
||||
]
|
||||
},
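One possible way to fill in such a stub, shown only as a hedged sketch rather than the intended solution: remember the set of labels seen during `fit` and predict one of them uniformly at random for each query.

```python
import numpy as np

class RandomDummyClassifier:
    """Sketch of a dummy classifier that predicts a class uniformly at random."""

    def fit(self, X, y):
        # Only the set of possible labels needs to be remembered.
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        # One uniformly random label per query sample.
        return np.random.choice(self.classes_, size=len(X))
```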
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1dd67c48",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Implement a function to evaluate the performance of a classification by computing the accuracy ($N_{correct}/N$)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "184f3905",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def accuracy_score(y_true, y_pred):\n",
|
||||
" pass"
|
||||
]
|
||||
},
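A minimal sketch of the accuracy computation described above ($N_{correct}/N$); the function name mirrors the stub, but this is illustrative only:

```python
import numpy as np

def accuracy_score(y_true, y_pred):
    """Fraction of correct predictions: N_correct / N."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_true == y_pred)
```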
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "90dcae17",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Compute the performance of the dummy classifier using the provided test set."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fa666b66",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9e10cd97",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## c. K-Nearest Neighbors classifier"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "70009457",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Build a K-Nearest Neighbors classifier using an Euclidian distance computation and a simple majority voting criterion."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "759e924e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class KNNClassifier():\n",
|
||||
" \n",
|
||||
" def __init__(self, n_neighbors=3):\n",
|
||||
" \"\"\"\n",
|
||||
" Initialize the class.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" n_neighbors : int, default=3\n",
|
||||
" Number of neighbors to use by default.\n",
|
||||
" \"\"\"\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" def fit(self, X, y):\n",
|
||||
" \"\"\"\n",
|
||||
" Fit the k-nearest neighbors classifier.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" X : Numpy array or Pandas DataFrame of shape (n_samples, n_features)\n",
|
||||
" Training data.\n",
|
||||
" y : Numpy array or Pandas DataFrame of shape (n_samples,)\n",
|
||||
" Target values.\n",
|
||||
" \"\"\"\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" @staticmethod\n",
|
||||
" def _euclidian_distance(a, b):\n",
|
||||
" \"\"\"\n",
|
||||
" Utility function to compute the euclidian distance.\n",
|
||||
" \n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" a : Numpy array or Pandas DataFrame\n",
|
||||
" First operand.\n",
|
||||
" b : Numpy array or Pandas DataFrame\n",
|
||||
" Second operand.\n",
|
||||
" \"\"\"\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" def predict(self, X):\n",
|
||||
" \"\"\"\n",
|
||||
" Predict the class labels for the provided data.\n",
|
||||
"\n",
|
||||
" Parameters\n",
|
||||
" ----------\n",
|
||||
" X : Numpy array or Pandas DataFrame of shape (n_queries, n_features)\n",
|
||||
" Test samples.\n",
|
||||
"\n",
|
||||
" Returns\n",
|
||||
" -------\n",
|
||||
" y : Numpy array or Pandas DataFrame of shape (n_queries,)\n",
|
||||
" Class labels for each data sample.\n",
|
||||
" \"\"\"\n",
|
||||
" pass"
|
||||
]
|
||||
},
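A hedged sketch of how such a classifier could be implemented with a brute-force Euclidean distance and a simple majority vote; it assumes integer class labels (as produced by the ordinal encoding above) and is not meant as the official solution.

```python
import numpy as np

class SimpleKNN:
    """Illustrative brute-force kNN: Euclidean distance + majority vote."""

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        self.X_ = np.asarray(X, dtype=float)
        self.y_ = np.asarray(y)
        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        preds = []
        for x in X:
            # Euclidean distance from x to every training sample.
            dists = np.sqrt(((self.X_ - x) ** 2).sum(axis=1))
            # Labels of the k nearest samples, then a simple majority vote
            # (np.bincount assumes non-negative integer labels).
            nearest = self.y_[np.argsort(dists)[: self.n_neighbors]]
            preds.append(np.bincount(nearest).argmax())
        return np.array(preds)
```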
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6c2b4811",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Compute the performance of the system as a function of $k = 1...7$."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cf589e66",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
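An illustrative way to run this evaluation, assuming the `SimpleKNN` and `accuracy_score` sketches shown earlier in this section (the notebook cells themselves are left for the student to fill in):

```python
# Evaluate the classifier for k = 1..7 on the held-out test set.
for k in range(1, 8):
    knn = SimpleKNN(n_neighbors=k).fit(X_train, y_train)
    acc = accuracy_score(y_test, knn.predict(X_test))
    print(f"k={k}: accuracy={acc:.3f}")
```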
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "71c51f35",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Run the KNN algorithm using only the features `TotalIncome` and `CreditHistory`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2f6f262b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e2b1a682",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Re-run the KNN algorithm using the features `TotalIncome`, `CreditHistory` and `Married`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c0bda7ee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2724167a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Re-run the KNN algorithm using all features."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "46ec9699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "648aa52e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
666
PW-2/ex3-knn-mnist/knn-mnist-stud.ipynb
Normal file
@@ -0,0 +1,666 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# k-Nearest Neighbor (kNN) exercise 3 - MNIST Dataset\n",
|
||||
"\n",
|
||||
"*Complete and hand in this completed worksheet.*\n",
|
||||
"\n",
|
||||
"The kNN classifier consists of two stages:\n",
|
||||
"\n",
|
||||
"- During training, the classifier takes the training data and simply remembers it\n",
|
||||
"- During testing, kNN classifies every test image by comparing to all training images and transfering the labels of the k most similar training examples\n",
|
||||
"- In this exercise, the ultimate goal is to find an optimal value of hyper-parameter k through a cross-validation procedure.\n",
|
||||
"\n",
|
||||
"In this exercise you will implement these steps and gain proficiency in writing efficient, vectorized code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Run some setup code for this notebook.\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# This is a bit of magic to make matplotlib figures appear inline in the notebook\n",
|
||||
"# rather than in a new window. Also setting some parameters for display.\n",
|
||||
"%matplotlib inline\n",
|
||||
"plt.rcParams['figure.figsize'] = (10.0, 10.0) # set default size of plots\n",
|
||||
"plt.rcParams['image.interpolation'] = 'nearest'\n",
|
||||
"plt.rcParams['image.cmap'] = 'gray'\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This is a method to read the MNIST dataset from a ROOT directory\n",
|
||||
"def load_MNIST(ROOT):\n",
|
||||
" '''load all of mnist\n",
|
||||
" training set first'''\n",
|
||||
" Xtr = []\n",
|
||||
" train = pd.read_csv(os.path.join(ROOT, 'mnist_train.csv'))\n",
|
||||
" X = np.array(train.drop('label', axis=1))\n",
|
||||
" Ytr = np.array(train['label'])\n",
|
||||
" # With this for-loop we give the data a shape of the acctual image (28x28)\n",
|
||||
" # instead of the shape in file (1x784)\n",
|
||||
" for row in X:\n",
|
||||
" Xtr.append(row.reshape(28,28))\n",
|
||||
" # load test set second\n",
|
||||
" Xte = []\n",
|
||||
" test = pd.read_csv(os.path.join(ROOT, 'mnist_test.csv'))\n",
|
||||
" X = np.array(test.drop('label', axis=1))\n",
|
||||
" Yte = np.array(test['label'])\n",
|
||||
" # same reshaping\n",
|
||||
" for row in X:\n",
|
||||
" Xte.append(row.reshape(28,28))\n",
|
||||
" \n",
|
||||
" return np.array(Xtr), np.array(Ytr), np.array(Xte), np.array(Yte)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the raw MNIST data.\n",
|
||||
"mnist_dir = 'YOUR-MNIST-DIR-HERE' # TODO: update this dir information to your own dir\n",
|
||||
"X_train, y_train, X_test, y_test = load_MNIST(mnist_dir)\n",
|
||||
"\n",
|
||||
"# As a sanity check, we print out the size of the training and test data.\n",
|
||||
"print('Training data shape: ', X_train.shape)\n",
|
||||
"print('Training labels shape: ', y_train.shape)\n",
|
||||
"print('Test data shape: ', X_test.shape)\n",
|
||||
"print('Test labels shape: ', y_test.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Inline Question #1:** Notice the outputs of the shape attributes for the numpy arrays downloaded.\n",
|
||||
"\n",
|
||||
"- What are the ranks of the arrays for the training data and test data?\n",
|
||||
"- Are the shapes coherent from the description of the dataset that we can find [here](http://yann.lecun.com/exdb/mnist/)? Explain the different dimensions of the 4 arrays in cell above."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Your Answer**: *fill this in.*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Now let's visualise some of the images\n",
|
||||
"classes = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']\n",
|
||||
"num_classes = len(classes)\n",
|
||||
"samples_per_class = 7\n",
|
||||
"for y, cls in enumerate(classes): # y and cls takes values from 0-9\n",
|
||||
" idxs = np.flatnonzero(y_train == y) # gets the indices of samples that corresponds to class y\n",
|
||||
" idxs = np.random.choice(idxs, samples_per_class, replace=False) # picks randomly samples_per_class indices\n",
|
||||
" for i, idx in enumerate(idxs):\n",
|
||||
" plt_idx = i * num_classes + y + 1 # determines the sub-plot index\n",
|
||||
" plt.subplot(samples_per_class, num_classes, plt_idx)\n",
|
||||
" plt.imshow(X_train[idx].astype('uint8'))\n",
|
||||
" plt.axis('off')\n",
|
||||
" if i == 0:\n",
|
||||
" plt.title(cls)\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Subsample the data for more efficient code execution in this exercise. We do this to make it go faster. \n",
|
||||
"# When you will have completed the whole notebook, you can run it again on a larger (or total) dataset \n",
|
||||
"# and observe the difference in terms of accuracy (and speedup).\n",
|
||||
"num_training = 5000\n",
|
||||
"mask = range(num_training)\n",
|
||||
"X_train = X_train[mask]\n",
|
||||
"y_train = y_train[mask]\n",
|
||||
"\n",
|
||||
"num_test = 500\n",
|
||||
"mask = range(num_test)\n",
|
||||
"X_test = X_test[mask]\n",
|
||||
"y_test = y_test[mask]\n",
|
||||
"\n",
|
||||
"# TODO: sanity check: write code to print out the size of the subsampled training and test data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Shape the images vectors\n",
|
||||
"X_train = np.reshape(X_train, (X_train.shape[0], -1)) # when reshaping, -1 means \"infer target dims from orig dims\n",
|
||||
"X_test = np.reshape(X_test, (X_test.shape[0], -1)) # in this case it flattens the (28,28,3) into 3072 \n",
|
||||
"print(X_train.shape, X_test.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Inline Question #2:** Notice the use of np.reshape to transform images into vectors.\n",
|
||||
"\n",
|
||||
"- What is the effect of -1 in the reshape command?\n",
|
||||
"- Are the shapes coherent from this vectorization? Explain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Your Answer**: *fill this in.*"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This is a class definition for our KNN classifier. Complete the code indicated by the TODO sections.\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"class KNearestNeighbor(object):\n",
|
||||
" \"\"\" a kNN classifier with L2 distance \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self):\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def train(self, X, y):\n",
|
||||
" \"\"\"\n",
|
||||
" Train the classifier. For k-nearest neighbors this is just \n",
|
||||
" memorizing the training data.\n",
|
||||
"\n",
|
||||
" Inputs:\n",
|
||||
" - X: A numpy array of shape (num_train, D) containing the training data\n",
|
||||
" consisting of num_train samples each of dimension D.\n",
|
||||
" - y: A numpy array of shape (N,) containing the training labels, where\n",
|
||||
" y[i] is the label for X[i].\n",
|
||||
" \"\"\"\n",
|
||||
" self.X_train = X\n",
|
||||
" self.y_train = y\n",
|
||||
" \n",
|
||||
" def predict(self, X, k=1, num_loops=0):\n",
|
||||
" \"\"\"\n",
|
||||
" Predict labels for test data using this classifier.\n",
|
||||
"\n",
|
||||
" Inputs:\n",
|
||||
" - X: A numpy array of shape (num_test, D) containing test data consisting\n",
|
||||
" of num_test samples each of dimension D.\n",
|
||||
" - k: The number of nearest neighbors that vote for the predicted labels.\n",
|
||||
" - num_loops: Determines which implementation to use to compute distances\n",
|
||||
" between training points and testing points.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" - y: A numpy array of shape (num_test,) containing predicted labels for the\n",
|
||||
" test data, where y[i] is the predicted label for the test point X[i]. \n",
|
||||
" \"\"\"\n",
|
||||
" if num_loops == 0:\n",
|
||||
" dists = self.compute_distances_no_loops(X)\n",
|
||||
" elif num_loops == 1:\n",
|
||||
" dists = self.compute_distances_one_loop(X)\n",
|
||||
" elif num_loops == 2:\n",
|
||||
" dists = self.compute_distances_two_loops(X)\n",
|
||||
" else:\n",
|
||||
" raise ValueError('Invalid value %d for num_loops' % num_loops)\n",
|
||||
"\n",
|
||||
" return self.predict_labels(dists, k=k)\n",
|
||||
"\n",
|
||||
" def compute_distances_two_loops(self, X):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the distance between each test point in X and each training point\n",
|
||||
" in self.X_train using a nested loop over both the training data and the \n",
|
||||
" test data.\n",
|
||||
"\n",
|
||||
" Inputs:\n",
|
||||
" - X: A numpy array of shape (num_test, D) containing test data.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
|
||||
" is the Euclidean distance between the ith test point and the jth training\n",
|
||||
" point.\n",
|
||||
" \"\"\"\n",
|
||||
" num_test = X.shape[0]\n",
|
||||
" num_train = self.X_train.shape[0]\n",
|
||||
" dists = np.zeros((num_test, num_train))\n",
|
||||
" for i in range(num_test):\n",
|
||||
" for j in range(num_train):\n",
|
||||
" #####################################################################\n",
|
||||
" # TODO: #\n",
|
||||
" # Compute the l2 distance between the ith test point and the jth #\n",
|
||||
" # training point, and store the result in dists[i, j]. You should #\n",
|
||||
" # not use a loop over dimension. #\n",
|
||||
" #####################################################################\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" #####################################################################\n",
|
||||
" # END OF YOUR CODE #\n",
|
||||
" #####################################################################\n",
|
||||
" return dists\n",
|
||||
"\n",
|
||||
" def compute_distances_one_loop(self, X):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the distance between each test point in X and each training point\n",
|
||||
" in self.X_train using a single loop over the test data.\n",
|
||||
"\n",
|
||||
" Input / Output: Same as compute_distances_two_loops\n",
|
||||
" \"\"\"\n",
|
||||
" num_test = X.shape[0]\n",
|
||||
" num_train = self.X_train.shape[0]\n",
|
||||
" dists = np.zeros((num_test, num_train))\n",
|
||||
" for i in range(num_test):\n",
|
||||
" #######################################################################\n",
|
||||
" # TODO: #\n",
|
||||
" # Compute the l2 distance between the ith test point and all training #\n",
|
||||
" # points, and store the result in dists[i, :]. #\n",
|
||||
" #######################################################################\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" #######################################################################\n",
|
||||
" # END OF YOUR CODE #\n",
|
||||
" #######################################################################\n",
|
||||
" return dists\n",
|
||||
"\n",
|
||||
" def compute_distances_no_loops(self, X):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the distance between each test point in X and each training point\n",
|
||||
" in self.X_train using no explicit loops.\n",
|
||||
"\n",
|
||||
" Input / Output: Same as compute_distances_two_loops\n",
|
||||
" \"\"\"\n",
|
||||
" num_test = X.shape[0]\n",
|
||||
" num_train = self.X_train.shape[0]\n",
|
||||
" dists = np.zeros((num_test, num_train)) \n",
|
||||
" #########################################################################\n",
|
||||
" # TODO: #\n",
|
||||
" # Compute the l2 distance between all test points and all training #\n",
|
||||
" # points without using any explicit loops, and store the result in #\n",
|
||||
" # dists. #\n",
|
||||
" # #\n",
|
||||
" # You should implement this function using only basic array operations; #\n",
|
||||
" # in particular you should not use functions from scipy. #\n",
|
||||
" # #\n",
|
||||
" # HINT: Try to formulate the l2 distance using matrix multiplication #\n",
|
||||
" # and two broadcast sums. #\n",
|
||||
" #########################################################################\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" #########################################################################\n",
|
||||
" # END OF YOUR CODE #\n",
|
||||
" #########################################################################\n",
|
||||
" return dists\n",
|
||||
"\n",
|
||||
" def predict_labels(self, dists, k=1):\n",
|
||||
" \"\"\"\n",
|
||||
" Given a matrix of distances between test points and training points,\n",
|
||||
" predict a label for each test point.\n",
|
||||
"\n",
|
||||
" Inputs:\n",
|
||||
" - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
|
||||
" gives the distance betwen the ith test point and the jth training point.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" - y: A numpy array of shape (num_test,) containing predicted labels for the\n",
|
||||
" test data, where y[i] is the predicted label for the test point X[i]. \n",
|
||||
" \"\"\"\n",
|
||||
" num_test = dists.shape[0]\n",
|
||||
" y_pred = np.zeros(num_test)\n",
|
||||
" for i in range(num_test):\n",
|
||||
" # A list of length k storing the labels of the k nearest neighbors to\n",
|
||||
" # the ith test point.\n",
|
||||
" closest_y = []\n",
|
||||
" #########################################################################\n",
|
||||
" # TODO: #\n",
|
||||
" # Use the distance matrix to find the k nearest neighbors of the ith #\n",
|
||||
" # testing point, and use self.y_train to find the labels of these #\n",
|
||||
" # neighbors. Store these labels in closest_y. #\n",
|
||||
" # Hint: Look up the function numpy.argsort. #\n",
|
||||
" #########################################################################\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" #########################################################################\n",
|
||||
" # TODO: #\n",
|
||||
" # Now that you have found the labels of the k nearest neighbors, you #\n",
|
||||
" # need to find the most common label in the list closest_y of labels. #\n",
|
||||
" # Store this label in y_pred[i]. Break ties by choosing the smaller #\n",
|
||||
" # label. #\n",
|
||||
" #########################################################################\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" #########################################################################\n",
|
||||
" # END OF YOUR CODE # \n",
|
||||
" #########################################################################\n",
|
||||
"\n",
|
||||
" return y_pred"
|
||||
]
|
||||
},
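The hint in `compute_distances_no_loops` points at the expansion $\|a-b\|^2 = \|a\|^2 + \|b\|^2 - 2\,a\cdot b$. Below is a hedged sketch of that identity, together with a bincount-based vote such as the one `predict_labels` asks for (ties broken toward the smaller label); these helpers are illustrative and not necessarily the intended solution.

```python
import numpy as np

def l2_distances_no_loops(X_test, X_train):
    """Pairwise Euclidean distances without explicit Python loops, using
    ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b."""
    X_test = X_test.astype(float)
    X_train = X_train.astype(float)
    test_sq = (X_test ** 2).sum(axis=1)[:, np.newaxis]    # (num_test, 1)
    train_sq = (X_train ** 2).sum(axis=1)[np.newaxis, :]  # (1, num_train)
    cross = X_test @ X_train.T                            # (num_test, num_train)
    sq = np.maximum(test_sq + train_sq - 2.0 * cross, 0.0)  # clip tiny negatives
    return np.sqrt(sq)

def majority_vote(closest_y):
    """Most common label among the k nearest; np.bincount(...).argmax()
    naturally breaks ties by choosing the smaller label."""
    return np.bincount(closest_y).argmax()
```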
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a kNN classifier instance. \n",
|
||||
"# Remember that training a kNN classifier is a noop: \n",
|
||||
"# the Classifier simply remembers the data and does no further processing \n",
|
||||
"classifier = KNearestNeighbor()\n",
|
||||
"classifier.train(X_train, y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# TODO: implement compute_distances_two_loops from the knn class definition above\n",
|
||||
"\n",
|
||||
"# Test your implementation:\n",
|
||||
"dists = classifier.compute_distances_two_loops(X_test)\n",
|
||||
"print(dists.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We can visualize the distance matrix: each row is a single test example and\n",
|
||||
"# its distances to training examples\n",
|
||||
"plt.imshow(dists, interpolation='none')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Inline Question #3:** Notice the structured patterns in the distance matrix, where some rows or columns are visible brighter. (Note that with the default color scheme black indicates low distances while white indicates high distances.)\n",
|
||||
"\n",
|
||||
"- What in the data is the cause behind the distinctly bright rows?\n",
|
||||
"- What causes the bright columns?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Your Answer**: *fill this in.*\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# TODO : Now implement the function predict_labels from the KNN class above and run the code below:\n",
|
||||
"# We use k = 1 (which is Nearest Neighbor).\n",
|
||||
"y_test_pred = classifier.predict_labels(dists, k=1)\n",
|
||||
"\n",
|
||||
"# Compute and print the fraction of correctly predicted examples\n",
|
||||
"num_correct = np.sum(y_test_pred == y_test)\n",
|
||||
"accuracy = float(num_correct) / num_test\n",
|
||||
"print('Got %d / %d correct => accuracy: %f' % (int(num_correct), num_test, accuracy))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You should expect to see approximately `90%` accuracy. Now lets try out a larger `k`, say `k = 5`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_test_pred = classifier.predict_labels(dists, k=5)\n",
|
||||
"num_correct = np.sum(y_test_pred == y_test)\n",
|
||||
"accuracy = float(num_correct) / num_test\n",
|
||||
"print('Got %d / %d correct => accuracy: %f' % (int(num_correct), num_test, accuracy))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You should expect to see a slightly better performance than with `k = 1`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Now lets speed up distance matrix computation by using partial vectorization\n",
|
||||
"# with one loop. Implement the function compute_distances_one_loop and run the\n",
|
||||
"# code below:\n",
|
||||
"dists_one = classifier.compute_distances_one_loop(X_test)\n",
|
||||
"\n",
|
||||
"# To ensure that our vectorized implementation is correct, we make sure that it\n",
|
||||
"# agrees with the naive implementation. There are many ways to decide whether\n",
|
||||
"# two matrices are similar; one of the simplest is the Frobenius norm. In case\n",
|
||||
"# you haven't seen it before, the Frobenius norm of two matrices is the square\n",
|
||||
"# root of the squared sum of differences of all elements; in other words, reshape\n",
|
||||
"# the matrices into vectors and compute the Euclidean distance between them.\n",
|
||||
"difference = np.linalg.norm(dists - dists_one, ord='fro')\n",
|
||||
"print('Difference was: %f' % (difference, ))\n",
|
||||
"if difference < 0.001:\n",
|
||||
" print('Good! The distance matrices are the same')\n",
|
||||
"else:\n",
|
||||
" print('Uh-oh! The distance matrices are different')"
|
||||
]
|
||||
},
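A possible one-loop version, under the same illustrative naming as the earlier sketch: the trick is to broadcast one test point against the whole training matrix, so that only the loop over test points remains.

import numpy as np

def compute_distances_one_loop_sketch(X_test, X_train):
    # X_train has shape (num_train, D) and X_test[i] has shape (D,),
    # so the subtraction broadcasts across all training rows at once.
    num_test, num_train = X_test.shape[0], X_train.shape[0]
    dists = np.zeros((num_test, num_train))
    for i in range(num_test):
        dists[i, :] = np.sqrt(np.sum((X_train - X_test[i]) ** 2, axis=1))
    return dists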
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Now implement the fully vectorized version inside compute_distances_no_loops\n",
|
||||
"# and run the code\n",
|
||||
"dists_two = classifier.compute_distances_no_loops(X_test)\n",
|
||||
"\n",
|
||||
"# check that the distance matrix agrees with the one we computed before:\n",
|
||||
"difference = np.linalg.norm(dists - dists_two, ord='fro')\n",
|
||||
"print('Difference was: %f' % (difference, ))\n",
|
||||
"if difference < 0.001:\n",
|
||||
" print('Good! The distance matrices are the same')\n",
|
||||
"else:\n",
|
||||
" print('Uh-oh! The distance matrices are different')"
|
||||
]
|
||||
},
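The fully vectorized version usually rests on the expansion ||x - y||^2 = ||x||^2 - 2*x.y + ||y||^2, so that one matrix product covers all test/train pairs at once; a sketch with the same illustrative names as before:

import numpy as np

def compute_distances_no_loops_sketch(X_test, X_train):
    # Squared norms of every test and training point, plus the cross term X_test @ X_train.T.
    test_sq = np.sum(X_test ** 2, axis=1, keepdims=True)   # shape (num_test, 1)
    train_sq = np.sum(X_train ** 2, axis=1)                # shape (num_train,)
    cross = X_test.dot(X_train.T)                          # shape (num_test, num_train)
    sq_dists = test_sq - 2.0 * cross + train_sq            # broadcasts to (num_test, num_train)
    return np.sqrt(np.maximum(sq_dists, 0.0))              # clamp tiny negatives from round-off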
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's compare how fast the implementations are\n",
|
||||
"def time_function(f, *args):\n",
|
||||
" \"\"\"\n",
|
||||
" Call a function f with args and return the time (in seconds) that it took to execute.\n",
|
||||
" \"\"\"\n",
|
||||
" import time\n",
|
||||
" tic = time.time()\n",
|
||||
" f(*args)\n",
|
||||
" toc = time.time()\n",
|
||||
" return toc - tic\n",
|
||||
"\n",
|
||||
"two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)\n",
|
||||
"print('Two loop version took %f seconds' % two_loop_time)\n",
|
||||
"\n",
|
||||
"one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)\n",
|
||||
"print('One loop version took %f seconds' % one_loop_time)\n",
|
||||
"\n",
|
||||
"no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)\n",
|
||||
"print('No loop version took %f seconds' % no_loop_time)\n",
|
||||
"\n",
|
||||
"# you should see significantly faster performance with the fully vectorized implementation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Cross-validation\n",
|
||||
"\n",
|
||||
"We have implemented the k-Nearest Neighbor classifier but we set the value k = 5 arbitrarily. We will now determine the best value of this hyperparameter with cross-validation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"num_folds = 5\n",
|
||||
"k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50]\n",
|
||||
"\n",
|
||||
"#X_train_folds = []\n",
|
||||
"#y_train_folds = []\n",
|
||||
"################################################################################\n",
|
||||
"# TODO: #\n",
|
||||
"# Split up the training data into folds. After splitting, X_train_folds and #\n",
|
||||
"# y_train_folds should each be lists of length num_folds, where #\n",
|
||||
"# y_train_folds[i] is the label vector for the points in X_train_folds[i]. #\n",
|
||||
"# Hint: Look up the numpy array_split function. #\n",
|
||||
"################################################################################\n",
|
||||
"pass\n",
|
||||
"\n",
|
||||
"################################################################################\n",
|
||||
"# END OF YOUR CODE #\n",
|
||||
"################################################################################\n",
|
||||
"\n",
|
||||
"# A dictionary holding the accuracies for different values of k that we find\n",
|
||||
"# when running cross-validation. After running cross-validation,\n",
|
||||
"# k_to_accuracies[k] should be a list of length num_folds giving the different\n",
|
||||
"# accuracy values that we found when using that value of k.\n",
|
||||
"k_to_accuracies = {}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"################################################################################\n",
|
||||
"# TODO: #\n",
|
||||
"# Perform k-fold cross validation to find the best value of k. For each #\n",
|
||||
"# possible value of k, run the k-nearest-neighbor algorithm num_folds times, #\n",
|
||||
"# where in each case you use all but one of the folds as training data and the #\n",
|
||||
"# last fold as a validation set. Store the accuracies for all fold and all #\n",
|
||||
"# values of k in the k_to_accuracies dictionary. #\n",
|
||||
"################################################################################\n",
|
||||
"pass\n",
|
||||
"\n",
|
||||
"################################################################################\n",
|
||||
"# END OF YOUR CODE #\n",
|
||||
"################################################################################\n",
|
||||
"\n",
|
||||
"# Print out the computed accuracies\n",
|
||||
"for k in sorted(k_to_accuracies):\n",
|
||||
" for accuracy in k_to_accuracies[k]:\n",
|
||||
" print('k = %d, accuracy = %f' % (k, accuracy))"
|
||||
]
|
||||
},
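In case the two TODO blocks above are hard to get started on, here is one possible way to fill them in. This is a sketch, not the official solution; it reuses the variable names from the cell and assumes the classifier's predict method (used later in this notebook) is available:

# Split the training data into num_folds roughly equal folds
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

k_to_accuracies = {k: [] for k in k_choices}
for k in k_choices:
    for fold in range(num_folds):
        # Hold out one fold for validation, train on the remaining folds
        X_val, y_val = X_train_folds[fold], y_train_folds[fold]
        X_tr = np.concatenate(X_train_folds[:fold] + X_train_folds[fold + 1:])
        y_tr = np.concatenate(y_train_folds[:fold] + y_train_folds[fold + 1:])
        fold_classifier = KNearestNeighbor()
        fold_classifier.train(X_tr, y_tr)
        y_val_pred = fold_classifier.predict(X_val, k=k)
        k_to_accuracies[k].append(float(np.mean(y_val_pred == y_val)))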
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# plot the raw observations\n",
|
||||
"for k in k_choices:\n",
|
||||
" accuracies = k_to_accuracies[k]\n",
|
||||
" plt.scatter([k] * len(accuracies), accuracies)\n",
|
||||
"\n",
|
||||
"# plot the trend line with error bars that correspond to standard deviation\n",
|
||||
"accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())])\n",
|
||||
"accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())])\n",
|
||||
"plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)\n",
|
||||
"plt.title('Cross-validation on k')\n",
|
||||
"plt.xlabel('k')\n",
|
||||
"plt.ylabel('Cross-validation accuracy')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Based on the cross-validation results above, choose the best value for k, \n",
|
||||
"# retrain the classifier using all the training data, and test it on the test\n",
|
||||
"# data. You should be able to get above 90% accuracy on the test data.\n",
|
||||
"best_k = 1 # TODO: put your best k value here\n",
|
||||
"\n",
|
||||
"classifier = KNearestNeighbor()\n",
|
||||
"classifier.train(X_train, y_train)\n",
|
||||
"y_test_pred = classifier.predict(X_test, k=best_k)\n",
|
||||
"\n",
|
||||
"# Compute and display the accuracy\n",
|
||||
"num_correct = np.sum(y_test_pred == y_test)\n",
|
||||
"accuracy = float(num_correct) / num_test\n",
|
||||
"print('Got %d / %d correct => accuracy: %f' % (int(num_correct), num_test, accuracy))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
||||
10001
PW-2/ex3-knn-mnist/mnist/mnist_test.csv
Normal file
File diff suppressed because it is too large
28
PW-2/ex5-regression-knn/lifesat.csv
Normal file
@@ -0,0 +1,28 @@
|
||||
Country,GDP per capita (USD),Life satisfaction
|
||||
Russia,26456.3879381321,5.8
|
||||
Greece,27287.0834009302,5.4
|
||||
Turkey,28384.9877846263,5.5
|
||||
Latvia,29932.4939100562,5.9
|
||||
Hungary,31007.7684065437,5.6
|
||||
Portugal,32181.1545372343,5.4
|
||||
Poland,32238.157259275,6.1
|
||||
Estonia,35638.4213511812,5.7
|
||||
Spain,36215.4475907307,6.3
|
||||
Slovenia,36547.7389559849,5.9
|
||||
Lithuania,36732.034744031,5.9
|
||||
Israel,38341.3075704083,7.2
|
||||
Italy,38992.1483807498,6.0
|
||||
United Kingdom,41627.129269425,6.8
|
||||
France,42025.6173730617,6.5
|
||||
New Zealand,42404.3937381567,7.3
|
||||
Canada,45856.6256264804,7.4
|
||||
Finland,47260.800458441,7.6
|
||||
Belgium,48210.0331113444,6.9
|
||||
Australia,48697.8370282475,7.3
|
||||
Sweden,50683.3235097178,7.3
|
||||
Germany,50922.3580234484,7.0
|
||||
Austria,51935.6038618156,7.1
|
||||
Iceland,52279.7288513646,7.5
|
||||
Netherlands,54209.5638357302,7.4
|
||||
Denmark,55938.2128086032,7.6
|
||||
United States,60235.7284916969,6.9
|
||||
|
258
PW-2/ex5-regression-knn/regression-knn-stud.ipynb
Normal file
@@ -0,0 +1,258 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b94b0451",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.linear_model import LinearRegression\n",
|
||||
"\n",
|
||||
"# Download and prepare the data\n",
|
||||
"lifesat = pd.read_csv(\"lifesat.csv\")\n",
|
||||
"X = lifesat[[\"GDP per capita (USD)\"]].values\n",
|
||||
"y = lifesat[[\"Life satisfaction\"]].values\n",
|
||||
"\n",
|
||||
"# Visualize the data\n",
|
||||
"lifesat.plot(kind='scatter', grid=True,\n",
|
||||
" x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
|
||||
"plt.axis([23_500, 62_500, 4, 9])\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Select a linear model\n",
|
||||
"model = LinearRegression()\n",
|
||||
"\n",
|
||||
"# Train the model\n",
|
||||
"model.fit(X, y)\n",
|
||||
"\n",
|
||||
"# Make a prediction for Cyprus\n",
|
||||
"X_new = [[37_655.2]] # Cyprus' GDP per capita in 2020\n",
|
||||
"print(model.predict(X_new)) # outputs [[6.30165767]]\n"
|
||||
]
|
||||
},
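As a quick sanity check on what model.fit just computed, the fitted slope and intercept can be printed via the standard scikit-learn attributes (optional, not part of the exercise):

# The regression line plotted in red further down is y = coef_ * x + intercept_
print("intercept:", model.intercept_)  # array of shape (1,)
print("slope:", model.coef_)           # array of shape (1, 1)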
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "94fda07f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_test = np.linspace(25000, 60000, 200)\n",
|
||||
"X_test = [[value] for value in X_test]\n",
|
||||
"y_test = model.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "838b0242",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Visualize the data\n",
|
||||
"lifesat.plot(kind='scatter', grid=True,\n",
|
||||
" x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
|
||||
"plt.axis([23_500, 62_500, 4, 9])\n",
|
||||
"plt.plot(X_test, y_test, color='red')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aa14a4ca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class KNearestNeighborRegressor(object):\n",
|
||||
" \"\"\" a kNN regressor with L2 distance \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self):\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def train(self, X, y):\n",
|
||||
" \"\"\"\n",
|
||||
" Train the classifier. For k-nearest neighbors this is just \n",
|
||||
" memorizing the training data.\n",
|
||||
"\n",
|
||||
" Inputs:\n",
|
||||
" - X: A numpy array of shape (num_train, D) containing the training data\n",
|
||||
" consisting of num_train samples each of dimension D.\n",
|
||||
" - y: A numpy array of shape (N,) containing the training labels, where\n",
|
||||
" y[i] is the label for X[i].\n",
|
||||
" \"\"\"\n",
|
||||
" self.X_train = X\n",
|
||||
" self.y_train = y\n",
|
||||
" \n",
|
||||
" def predict(self, X, k=1):\n",
|
||||
" \"\"\"\n",
|
||||
" Predict labels for test data using this classifier.\n",
|
||||
"\n",
|
||||
" Inputs:\n",
|
||||
" - X: A numpy array of shape (num_test, D) containing test data consisting\n",
|
||||
" of num_test samples each of dimension D.\n",
|
||||
" - k: The number of nearest neighbors that vote for the predicted labels.\n",
|
||||
" - num_loops: Determines which implementation to use to compute distances\n",
|
||||
" between training points and testing points.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" - y: A numpy array of shape (num_test,) containing predicted labels for the\n",
|
||||
" test data, where y[i] is the predicted label for the test point X[i]. \n",
|
||||
" \"\"\"\n",
|
||||
" dists = self.compute_distances(X)\n",
|
||||
" \n",
|
||||
" return self.predict_values(dists, k=k)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" def compute_distances(self, X):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the distance between each test point in X and each training point\n",
|
||||
" in self.X_train using a single loop over the test data.\n",
|
||||
"\n",
|
||||
" Inputs:\n",
|
||||
" - X: A numpy array of shape (num_test, D) containing test data.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
|
||||
" is the Euclidean distance between the ith test point and the jth training\n",
|
||||
" point.\n",
|
||||
" \"\"\"\n",
|
||||
" num_test = X.shape[0]\n",
|
||||
" num_train = self.X_train.shape[0]\n",
|
||||
" dists = np.zeros((num_test, num_train))\n",
|
||||
" for i in range(num_test):\n",
|
||||
" #######################################################################\n",
|
||||
" # TODO: #\n",
|
||||
" # Compute the l2 distance between the ith test point and all training #\n",
|
||||
" # points, and store the result in dists[i, :]. #\n",
|
||||
" #######################################################################\n",
|
||||
" \n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" #######################################################################\n",
|
||||
" # END OF YOUR CODE #\n",
|
||||
" #######################################################################\n",
|
||||
" return dists\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" def predict_values(self, dists, k=1):\n",
|
||||
" \"\"\"\n",
|
||||
" Given a matrix of distances between test points and training points,\n",
|
||||
" predict a value for each test point.\n",
|
||||
"\n",
|
||||
" Inputs:\n",
|
||||
" - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
|
||||
" gives the distance betwen the ith test point and the jth training point.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" - y: A numpy array of shape (num_test,) containing predicted values for the\n",
|
||||
" test data, where y[i] is the predicted value for the test point X[i]. \n",
|
||||
" \"\"\"\n",
|
||||
" num_test = dists.shape[0]\n",
|
||||
" y_pred = np.zeros(num_test)\n",
|
||||
" for i in range(num_test):\n",
|
||||
" # A list of length k storing the labels of the k nearest neighbors to\n",
|
||||
" # the ith test point.\n",
|
||||
" closest_y = []\n",
|
||||
" \n",
|
||||
" #########################################################################\n",
|
||||
" # TODO: #\n",
|
||||
" # Use the distance matrix to find the k nearest neighbors of the ith #\n",
|
||||
" # testing point, and use self.y_train to find the labels of these #\n",
|
||||
" # neighbors. Store these labels in closest_y. #\n",
|
||||
" # Hint: Look up the function numpy.argsort. #\n",
|
||||
" #########################################################################\n",
|
||||
" \n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" #########################################################################\n",
|
||||
" # TODO: #\n",
|
||||
" # Now that you have found the labels of the k nearest neighbors, you #\n",
|
||||
" # need to compute the average of the target values corresponding to the #\n",
|
||||
" # nearest neighbors. #\n",
|
||||
" #########################################################################\n",
|
||||
" \n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" #########################################################################\n",
|
||||
" # END OF YOUR CODE # \n",
|
||||
" #########################################################################\n",
|
||||
"\n",
|
||||
" return y_pred"
|
||||
]
|
||||
},
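If the two TODO blocks in the regressor are unclear, a possible shape for them is sketched below as standalone helpers (the `_sketch` names are ours; inside the class, `X_train` and `y_train` are `self.X_train` and `self.y_train`). Note that `y` in this notebook has shape (n, 1), hence the `ravel()`:

import numpy as np

def knn_distances_sketch(X_test, X_train):
    # Same single-loop L2 distance computation as in the classifier exercise.
    dists = np.zeros((X_test.shape[0], X_train.shape[0]))
    for i in range(X_test.shape[0]):
        dists[i, :] = np.sqrt(np.sum((X_train - X_test[i]) ** 2, axis=1))
    return dists

def knn_predict_values_sketch(dists, y_train, k=1):
    # Regression: average the target values of the k nearest neighbours
    # instead of taking a majority vote over class labels.
    y_flat = np.asarray(y_train).ravel()
    y_pred = np.zeros(dists.shape[0])
    for i in range(dists.shape[0]):
        closest_y = y_flat[np.argsort(dists[i])[:k]]
        y_pred[i] = np.mean(closest_y)
    return y_pred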
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "267d1168",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"knn_reg = KNearestNeighborRegressor()\n",
|
||||
"knn_reg.train(np.array(X), y)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fd8203ba",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_hat_1 = knn_reg.predict(np.array(X_test), k=1)\n",
|
||||
"y_hat_3 = knn_reg.predict(np.array(X_test), k=3)\n",
|
||||
"y_hat_5 = knn_reg.predict(np.array(X_test), k=5)\n",
|
||||
"y_hat_7 = knn_reg.predict(np.array(X_test), k=7)\n",
|
||||
"y_hat_20 = knn_reg.predict(np.array(X_test), k=20)\n",
|
||||
"y_hat_27 = knn_reg.predict(np.array(X_test), k=27)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d3704256",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Visualize the data\n",
|
||||
"lifesat.plot(kind='scatter', grid=True,\n",
|
||||
" x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
|
||||
"plt.axis([23_500, 62_500, 4, 9])\n",
|
||||
"plt.plot(X_test, y_test, color='red')\n",
|
||||
"plt.plot(X_test, y_hat_1, color='green')\n",
|
||||
"# plt.plot(X_test, y_hat_3, color='blue')\n",
|
||||
"# plt.plot(X_test, y_hat_5, color='magenta')\n",
|
||||
"# plt.plot(X_test, y_hat_7, color='orange')\n",
|
||||
"# plt.plot(X_test, y_hat_20, color='black')\n",
|
||||
"# plt.plot(X_test, y_hat_27, color='grey')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||