Started PW-2

This commit is contained in:
gabriel.marinoja
2025-09-23 13:18:25 +02:00
commit f0e1453d13
9 changed files with 13088 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
MLvenv/**

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 59 KiB

View File

@@ -0,0 +1,481 @@
Gender,Married,Education,TotalIncome,LoanAmount,CreditHistory,LoanStatus
Male,Yes,Graduate,6091.0,128.0,1.0,N
Male,Yes,Graduate,3000.0,66.0,1.0,Y
Male,Yes,Not Graduate,4941.0,120.0,1.0,Y
Male,No,Graduate,6000.0,141.0,1.0,Y
Male,Yes,Graduate,9613.0,267.0,1.0,Y
Male,Yes,Not Graduate,3849.0,95.0,1.0,Y
Male,Yes,Graduate,5540.0,158.0,0.0,N
Male,Yes,Graduate,5532.0,168.0,1.0,Y
Male,Yes,Graduate,23809.0,349.0,1.0,N
Male,Yes,Graduate,3900.0,70.0,1.0,Y
Male,Yes,Graduate,11179.0,200.0,1.0,Y
Male,No,Graduate,4693.0,114.0,1.0,N
Male,Yes,Graduate,2385.0,17.0,1.0,Y
Male,No,Graduate,4950.0,125.0,1.0,Y
Female,No,Graduate,3510.0,76.0,0.0,N
Male,Yes,Not Graduate,4887.0,133.0,1.0,N
Male,Yes,Not Graduate,7660.0,104.0,0.0,N
Male,Yes,Graduate,11580.0,315.0,1.0,Y
Male,Yes,Not Graduate,4511.0,116.0,0.0,N
Male,Yes,Graduate,9560.0,191.0,1.0,Y
Male,Yes,Graduate,5052.0,122.0,1.0,Y
Male,Yes,Not Graduate,5266.0,110.0,1.0,Y
Male,No,Not Graduate,1442.0,35.0,1.0,N
Male,No,Graduate,3167.0,74.0,1.0,N
Male,No,Graduate,4692.0,106.0,1.0,N
Male,Yes,Graduate,5167.0,114.0,1.0,Y
Male,No,Graduate,15500.0,320.0,1.0,N
Female,Yes,Graduate,5126.0,144.0,1.0,Y
Male,No,Graduate,11376.0,184.0,1.0,Y
Male,No,Not Graduate,5416.0,110.0,1.0,Y
Male,No,Graduate,3600.0,80.0,1.0,N
Male,No,Graduate,3013.0,47.0,1.0,Y
Male,Yes,Graduate,6277.0,134.0,1.0,Y
Male,Yes,Graduate,5649.0,44.0,1.0,Y
Male,Yes,Graduate,5821.0,144.0,1.0,Y
Female,Yes,Graduate,6085.0,120.0,0.0,N
Female,No,Graduate,6275.0,144.0,1.0,Y
Female,Yes,Not Graduate,3572.0,100.0,1.0,Y
Female,No,Graduate,3086.0,120.0,1.0,Y
Female,No,Graduate,4230.0,112.0,1.0,N
Male,Yes,Graduate,4616.0,134.0,1.0,N
Female,Yes,Graduate,11500.0,286.0,0.0,N
Male,Yes,Graduate,3875.0,97.0,1.0,Y
Male,Yes,Graduate,3723.0,96.0,1.0,Y
Male,Yes,Graduate,5566.0,135.0,1.0,N
Male,Yes,Graduate,10330.0,180.0,1.0,Y
Male,Yes,Not Graduate,6216.0,144.0,1.0,Y
Male,Yes,Graduate,6296.0,120.0,1.0,Y
Male,Yes,Graduate,3029.0,99.0,1.0,Y
Male,Yes,Not Graduate,6058.0,165.0,0.0,N
Female,No,Graduate,4166.0,116.0,0.0,N
Male,Yes,Graduate,10321.0,258.0,1.0,N
Male,No,Not Graduate,5454.0,126.0,0.0,N
Male,Yes,Graduate,10750.0,312.0,1.0,Y
Male,Yes,Not Graduate,7100.0,125.0,1.0,Y
Female,No,Graduate,4300.0,136.0,0.0,N
Male,Yes,Graduate,6274.0,172.0,1.0,Y
Male,Yes,Not Graduate,3750.0,97.0,1.0,Y
Male,No,Graduate,3500.0,81.0,1.0,Y
Male,Yes,Graduate,7040.0,187.0,1.0,Y
Male,No,Graduate,3750.0,113.0,1.0,N
Male,No,Graduate,8500.0,176.0,1.0,N
Male,Yes,Graduate,4022.0,110.0,1.0,N
Male,Yes,Graduate,7167.0,180.0,0.0,N
Female,No,Graduate,3846.0,111.0,1.0,Y
Female,Yes,Graduate,3259.0,167.0,1.0,N
Male,Yes,Graduate,3988.0,50.0,1.0,Y
Male,No,Graduate,4897.0,136.0,1.0,Y
Male,Yes,Graduate,4618.0,104.0,1.0,Y
Male,No,Graduate,8566.0,210.0,1.0,Y
Male,Yes,Graduate,9862.0,175.0,1.0,Y
Male,Yes,Graduate,5858.0,131.0,1.0,Y
Male,Yes,Graduate,11904.0,188.0,1.0,Y
Male,Yes,Not Graduate,5093.0,81.0,1.0,Y
Male,No,Graduate,4133.0,122.0,1.0,Y
Male,No,Not Graduate,3620.0,25.0,1.0,Y
Female,Yes,Graduate,4786.0,137.0,1.0,Y
Male,Yes,Graduate,2974.0,50.0,1.0,Y
Male,Yes,Not Graduate,4188.0,115.0,1.0,Y
Male,Yes,Graduate,5300.0,131.0,1.0,Y
Male,Yes,Not Graduate,7551.0,133.0,1.0,Y
Male,No,Graduate,8649.0,151.0,1.0,Y
Male,Yes,Graduate,4082.0,100.0,1.0,Y
Male,Yes,Graduate,12543.0,225.0,1.0,Y
Male,Yes,Graduate,7400.0,216.0,0.0,N
Male,Yes,Not Graduate,2825.0,94.0,1.0,Y
Male,No,Graduate,5316.0,136.0,1.0,Y
Male,Yes,Graduate,14583.0,185.0,1.0,Y
Female,Yes,Graduate,5450.0,154.0,1.0,Y
Male,Yes,Graduate,7710.0,175.0,1.0,N
Female,No,Graduate,10408.0,259.0,1.0,Y
Female,No,Graduate,4166.0,44.0,1.0,Y
Female,No,Graduate,11117.0,137.0,0.0,Y
Male,Yes,Graduate,2957.0,81.0,1.0,Y
Male,Yes,Not Graduate,6314.0,194.0,1.0,Y
Male,Yes,Graduate,14363.0,160.0,0.0,N
Male,No,Graduate,3943.0,74.0,1.0,Y
Male,No,Graduate,2718.0,70.0,1.0,Y
Male,Yes,Graduate,3459.0,25.0,1.0,Y
Male,No,Graduate,4895.0,102.0,1.0,Y
Male,Yes,Graduate,11750.0,290.0,1.0,N
Female,Yes,Graduate,4583.0,84.0,1.0,N
Male,Yes,Graduate,6816.0,88.0,1.0,Y
Male,No,Graduate,14999.0,242.0,0.0,N
Male,Yes,Not Graduate,5630.0,129.0,1.0,N
Male,Yes,Graduate,7125.0,185.0,1.0,N
Male,No,Graduate,5417.0,168.0,1.0,Y
Male,No,Graduate,6950.0,175.0,1.0,Y
Male,Yes,Graduate,4732.0,122.0,1.0,Y
Male,Yes,Graduate,11757.0,187.0,1.0,Y
Female,Yes,Graduate,6816.0,100.0,1.0,Y
Female,Yes,Graduate,14866.0,70.0,1.0,Y
Male,Yes,Graduate,2963.0,30.0,1.0,Y
Female,No,Graduate,11666.0,225.0,1.0,N
Male,Yes,Graduate,5690.0,125.0,1.0,Y
Male,No,Graduate,6277.0,118.0,0.0,N
Male,Yes,Graduate,6327.0,152.0,1.0,Y
Male,No,Graduate,9166.0,244.0,1.0,N
Male,Yes,Not Graduate,2281.0,113.0,1.0,N
Male,No,Graduate,3254.0,50.0,1.0,Y
Male,Yes,Graduate,39999.0,600.0,0.0,Y
Male,Yes,Graduate,9538.0,187.0,1.0,Y
Male,Yes,Graduate,10208.0,255.0,1.0,Y
Male,Yes,Not Graduate,2904.0,98.0,1.0,Y
Male,Yes,Graduate,7933.0,275.0,1.0,N
Male,Yes,Graduate,4369.0,121.0,0.0,N
Male,Yes,Graduate,5614.0,158.0,1.0,Y
Male,Yes,Graduate,9323.0,75.0,1.0,Y
Female,Yes,Graduate,4583.0,112.0,1.0,N
Male,Yes,Graduate,5772.0,129.0,1.0,Y
Male,No,Graduate,2237.0,63.0,0.0,N
Male,Yes,Graduate,8000.0,200.0,1.0,Y
Male,Yes,Not Graduate,3522.0,81.0,1.0,N
Male,Yes,Graduate,11333.0,187.0,1.0,Y
Male,Yes,Not Graduate,5080.0,87.0,1.0,N
Male,Yes,Graduate,5461.0,116.0,1.0,Y
Male,Yes,Graduate,3664.0,101.0,1.0,Y
Male,Yes,Graduate,16816.0,495.0,0.0,N
Male,Yes,Graduate,3750.0,116.0,1.0,Y
Male,No,Not Graduate,3784.0,102.0,0.0,N
Male,Yes,Graduate,13650.0,180.0,0.0,N
Male,Yes,Graduate,4600.0,73.0,1.0,Y
Male,Yes,Graduate,33846.0,260.0,1.0,N
Female,Yes,Graduate,3625.0,108.0,1.0,Y
Male,Yes,Graduate,43897.0,120.0,1.0,Y
Male,Yes,Graduate,2178.0,66.0,0.0,N
Male,Yes,Graduate,9328.0,188.0,1.0,Y
Male,No,Not Graduate,4885.0,48.0,1.0,Y
Male,No,Graduate,12000.0,164.0,1.0,N
Male,Yes,Not Graduate,6033.0,160.0,1.0,N
Male,No,Graduate,3858.0,76.0,1.0,Y
Male,No,Graduate,4191.0,120.0,1.0,Y
Male,Yes,Graduate,5708.0,170.0,1.0,N
Male,No,Graduate,12083.0,187.0,1.0,Y
Male,No,Graduate,11000.0,83.0,1.0,N
Male,Yes,Not Graduate,5100.0,90.0,1.0,Y
Male,No,Graduate,4923.0,166.0,0.0,Y
Male,Yes,Not Graduate,4583.0,135.0,1.0,Y
Male,Yes,Not Graduate,3917.0,124.0,1.0,Y
Female,No,Not Graduate,4408.0,120.0,1.0,Y
Female,No,Graduate,3244.0,80.0,1.0,Y
Male,No,Not Graduate,6506.0,55.0,1.0,Y
Male,No,Graduate,2479.0,59.0,1.0,Y
Male,No,Graduate,3418.0,127.0,1.0,N
Female,No,Graduate,10000.0,214.0,1.0,N
Male,Yes,Graduate,4680.0,128.0,0.0,N
Male,Yes,Graduate,7787.0,240.0,1.0,Y
Male,Yes,Not Graduate,5703.0,130.0,1.0,Y
Male,Yes,Graduate,6194.0,137.0,1.0,Y
Male,Yes,Not Graduate,4833.0,100.0,1.0,Y
Male,Yes,Graduate,1950.0,135.0,1.0,N
Male,Yes,Graduate,5502.0,131.0,1.0,Y
Male,Yes,Graduate,2221.0,60.0,0.0,N
Male,Yes,Graduate,5726.0,116.0,1.0,Y
Male,No,Graduate,5762.0,144.0,1.0,Y
Male,Yes,Graduate,6250.0,128.0,1.0,Y
Male,Yes,Graduate,3250.0,170.0,1.0,N
Male,Yes,Graduate,7945.0,210.0,1.0,Y
Male,No,Graduate,6400.0,200.0,1.0,Y
Male,Yes,Graduate,4545.0,104.0,1.0,Y
Female,No,Graduate,8333.0,280.0,1.0,Y
Male,Yes,Graduate,4934.0,140.0,1.0,Y
Male,Yes,Graduate,6760.0,170.0,1.0,Y
Female,No,Graduate,3812.0,112.0,1.0,Y
Male,Yes,Graduate,3315.0,96.0,1.0,Y
Male,Yes,Graduate,10819.0,120.0,1.0,Y
Male,Yes,Not Graduate,4493.0,140.0,1.0,N
Male,No,Graduate,8666.0,155.0,1.0,Y
Male,Yes,Graduate,7550.0,108.0,1.0,Y
Male,Yes,Not Graduate,7823.0,123.0,1.0,Y
Male,No,Graduate,10383.0,120.0,1.0,N
Male,Yes,Graduate,9703.0,112.0,1.0,Y
Male,Yes,Not Graduate,6608.0,137.0,1.0,Y
Male,Yes,Graduate,4725.0,123.0,1.0,Y
Male,Yes,Graduate,3677.0,90.0,1.0,Y
Male,Yes,Not Graduate,5558.0,201.0,0.0,N
Female,No,Graduate,3427.0,138.0,1.0,N
Male,No,Not Graduate,4750.0,104.0,1.0,Y
Male,Yes,Not Graduate,9762.0,279.0,1.0,Y
Male,No,Graduate,16250.0,192.0,0.0,N
Female,No,Graduate,3083.0,255.0,1.0,Y
Male,No,Not Graduate,6045.0,115.0,0.0,N
Male,Yes,Graduate,5250.0,94.0,1.0,N
Male,Yes,Graduate,16783.0,304.0,1.0,N
Male,No,Graduate,4269.0,134.0,1.0,Y
Female,No,Graduate,3481.0,155.0,1.0,N
Female,No,Graduate,7200.0,120.0,1.0,Y
Male,No,Graduate,5166.0,128.0,1.0,Y
Male,No,Graduate,7542.0,151.0,1.0,Y
Male,Yes,Graduate,6095.0,150.0,1.0,Y
Male,Yes,Graduate,6144.0,160.0,0.0,Y
Female,No,Graduate,4436.0,90.0,1.0,Y
Female,No,Graduate,3237.0,30.0,1.0,Y
Male,Yes,Graduate,11146.0,136.0,1.0,Y
Male,No,Graduate,4690.0,126.0,1.0,Y
Male,Yes,Graduate,4843.0,150.0,1.0,Y
Male,Yes,Graduate,3900.0,90.0,1.0,Y
Male,Yes,Graduate,4592.0,115.0,1.0,Y
Male,Yes,Graduate,7267.0,207.0,1.0,Y
Male,Yes,Graduate,4403.0,80.0,1.0,Y
Male,Yes,Graduate,14583.0,436.0,1.0,Y
Male,No,Not Graduate,6479.0,158.0,0.0,N
Male,Yes,Graduate,4727.0,112.0,1.0,Y
Male,Yes,Graduate,3286.7999878,78.0,1.0,Y
Female,No,Graduate,3477.0,54.0,1.0,Y
Male,No,Graduate,6211.0,89.0,1.0,Y
Female,No,Graduate,4317.0,99.0,1.0,N
Male,Yes,Graduate,5704.0,120.0,1.0,Y
Female,No,Graduate,4124.0,115.0,1.0,Y
Male,No,Graduate,9508.0,187.0,1.0,Y
Male,Yes,Graduate,5491.0,139.0,1.0,Y
Male,Yes,Graduate,4400.0,127.0,0.0,N
Male,Yes,Graduate,4713.0,134.0,1.0,Y
Male,Yes,Graduate,5717.0,172.0,1.0,Y
Male,Yes,Graduate,6875.0,200.0,1.0,Y
Female,Yes,Graduate,4666.0,135.0,1.0,Y
Female,No,Graduate,7541.0,151.0,1.0,N
Male,Yes,Graduate,4939.0,113.0,1.0,N
Male,Yes,Not Graduate,4734.0,93.0,0.0,N
Female,No,Graduate,5000.0,132.0,1.0,Y
Male,Yes,Graduate,3428.0,96.0,1.0,Y
Male,No,Graduate,6500.0,140.0,1.0,Y
Female,No,Graduate,5428.0,135.0,1.0,Y
Female,No,Graduate,4263.0,104.0,0.0,N
Male,No,Graduate,20233.0,480.0,1.0,N
Female,No,Graduate,2917.0,84.0,1.0,Y
Male,No,Not Graduate,5332.0,111.0,1.0,Y
Female,No,Graduate,2507.0,56.0,1.0,Y
Male,Yes,Not Graduate,5039.0,111.0,1.0,Y
Male,Yes,Graduate,3717.0,120.0,1.0,Y
Male,Yes,Graduate,10000.0,155.0,1.0,N
Male,Yes,Graduate,4567.0,115.0,1.0,Y
Male,Yes,Graduate,4531.0,124.0,1.0,Y
Male,Yes,Graduate,15000.0,300.0,1.0,Y
Male,Yes,Graduate,13649.0,376.0,0.0,N
Male,No,Graduate,4917.0,130.0,0.0,Y
Male,Yes,Graduate,7978.0,184.0,1.0,Y
Female,Yes,Graduate,6784.0,110.0,1.0,N
Female,No,Graduate,2500.0,67.0,1.0,Y
Male,No,Graduate,6177.0,117.0,1.0,Y
Male,No,Graduate,2935.0,98.0,1.0,Y
Male,Yes,Graduate,7100.0,176.0,1.0,Y
Female,No,Graduate,4160.0,71.0,1.0,Y
Male,Yes,Not Graduate,4234.0,173.0,1.0,N
Female,No,Graduate,2378.0,46.0,1.0,N
Male,Yes,Not Graduate,5783.0,158.0,1.0,Y
Male,Yes,Not Graduate,3173.0,74.0,1.0,Y
Male,Yes,Graduate,4957.0,160.0,1.0,Y
Male,Yes,Not Graduate,5251.0,126.0,1.0,Y
Male,Yes,Graduate,8875.0,187.0,1.0,Y
Male,Yes,Graduate,9083.0,228.0,1.0,Y
Male,No,Graduate,12917.0,308.0,1.0,N
Male,Yes,Graduate,4749.0,95.0,1.0,Y
Female,Yes,Graduate,5500.0,105.0,0.0,N
Female,Yes,Graduate,2928.0,130.0,1.0,Y
Male,Yes,Graduate,11500.0,165.0,1.0,Y
Male,Yes,Graduate,3875.0,67.0,1.0,N
Male,Yes,Not Graduate,4666.0,100.0,0.0,N
Male,Yes,Graduate,8334.0,200.0,1.0,Y
Female,No,Graduate,4723.0,81.0,1.0,N
Male,Yes,Graduate,8667.0,236.0,1.0,Y
Male,Yes,Graduate,7083.0,130.0,1.0,Y
Male,No,Graduate,6822.0,141.0,1.0,Y
Male,No,Not Graduate,6216.0,133.0,1.0,N
Male,No,Graduate,2500.0,96.0,1.0,N
Male,Yes,Graduate,6325.0,175.0,1.0,Y
Male,Yes,Graduate,24996.0,570.0,1.0,N
Female,No,Graduate,15759.0,55.0,1.0,Y
Male,Yes,Graduate,5185.0,155.0,1.0,Y
Male,Yes,Graduate,17196.0,380.0,1.0,Y
Male,No,Graduate,5049.0,111.0,0.0,N
Male,Yes,Graduate,5740.0,120.0,1.0,Y
Male,Yes,Graduate,13746.0,130.0,1.0,Y
Male,No,Graduate,3069.0,71.0,1.0,N
Male,Yes,Graduate,5391.0,130.0,1.0,Y
Male,No,Graduate,10173.0,296.0,1.0,Y
Female,No,Graduate,6000.0,156.0,1.0,Y
Male,No,Graduate,7167.0,128.0,1.0,Y
Male,Yes,Graduate,4566.0,100.0,1.0,N
Male,No,Not Graduate,3946.0,132.0,1.0,Y
Male,Yes,Graduate,4750.0,136.0,1.0,Y
Male,Yes,Graduate,5488.0,125.0,1.0,Y
Male,No,Graduate,9167.0,185.0,1.0,Y
Male,Yes,Graduate,9504.0,275.0,1.0,Y
Male,Yes,Not Graduate,3618.0,113.0,1.0,Y
Male,Yes,Graduate,4500.0,113.0,1.0,Y
Female,No,Graduate,3180.0,71.0,0.0,N
Male,Yes,Graduate,4492.0,95.0,1.0,Y
Male,No,Not Graduate,5568.0,109.0,1.0,Y
Female,No,Graduate,3300.0,103.0,0.0,N
Male,Yes,Not Graduate,2889.0,45.0,0.0,N
Male,No,Not Graduate,2755.0,65.0,1.0,N
Male,No,Graduate,22500.0,103.0,1.0,Y
Female,No,Not Graduate,1963.0,53.0,1.0,Y
Female,No,Graduate,7441.0,194.0,1.0,N
Female,No,Graduate,4547.0,115.0,1.0,Y
Male,Yes,Not Graduate,4567.0,115.0,1.0,Y
Female,No,Not Graduate,2213.0,66.0,1.0,Y
Male,Yes,Graduate,8300.0,152.0,0.0,N
Male,Yes,Graduate,81000.0,360.0,0.0,N
Female,No,Not Graduate,3867.0,62.0,1.0,N
Male,Yes,Not Graduate,6096.0,218.0,0.0,N
Male,Yes,Not Graduate,4286.0,110.0,1.0,Y
Female,Yes,Not Graduate,5386.0,178.0,0.0,N
Female,No,Graduate,2995.0,60.0,1.0,Y
Female,No,Graduate,2600.0,160.0,1.0,N
Male,Yes,Graduate,21600.0,239.0,1.0,N
Male,Yes,Graduate,3798.0,112.0,1.0,Y
Male,Yes,Graduate,4663.0,138.0,1.0,Y
Male,Yes,Graduate,5829.0,138.0,1.0,Y
Male,Yes,Graduate,3539.0,100.0,1.0,Y
Male,Yes,Graduate,14880.0,96.0,1.0,Y
Male,Yes,Graduate,6966.0,121.0,1.0,Y
Female,No,Not Graduate,4606.0,81.0,1.0,N
Male,Yes,Graduate,5935.0,133.0,1.0,Y
Male,Yes,Graduate,2936.12000084,87.0,1.0,Y
Male,No,Not Graduate,2717.0,60.0,1.0,Y
Female,No,Graduate,8624.0,150.0,1.0,Y
Male,No,Graduate,6500.0,105.0,0.0,N
Male,Yes,Graduate,4765.0,143.0,1.0,Y
Male,No,Graduate,3750.0,100.0,1.0,Y
Male,No,Graduate,3777.0,50.0,1.0,Y
Male,No,Graduate,10416.0,187.0,0.0,N
Female,Yes,Not Graduate,7142.0,138.0,1.0,Y
Male,No,Graduate,8724.0,187.0,1.0,Y
Male,Yes,Graduate,9734.0,180.0,1.0,Y
Male,No,Not Graduate,6700.0,148.0,1.0,Y
Male,No,Graduate,37719.0,152.0,1.0,Y
Male,Yes,Graduate,4676.0,130.0,1.0,Y
Male,Yes,Not Graduate,4652.0,110.0,1.0,Y
Male,Yes,Graduate,5050.0,150.0,0.0,N
Male,Yes,Not Graduate,3564.0,125.0,0.0,N
Male,Yes,Graduate,5681.0,149.0,0.0,N
Male,Yes,Graduate,4949.0,90.0,0.0,Y
Male,No,Graduate,7085.0,84.0,1.0,Y
Male,Yes,Graduate,3859.0,96.0,1.0,Y
Male,Yes,Graduate,4301.0,118.0,1.0,Y
Male,Yes,Graduate,6277.0,173.0,1.0,N
Male,No,Graduate,4354.0,136.0,1.0,Y
Male,Yes,Graduate,8334.0,160.0,1.0,N
Male,Yes,Graduate,7740.0,128.0,1.0,Y
Male,Yes,Graduate,5203.0,153.0,1.0,Y
Male,No,Graduate,4166.0,98.0,0.0,N
Male,No,Graduate,6000.0,140.0,1.0,Y
Male,Yes,Not Graduate,4611.0,70.0,0.0,N
Male,Yes,Graduate,6784.0,110.0,1.0,N
Male,Yes,Graduate,5529.0,162.0,1.0,Y
Male,Yes,Not Graduate,4153.0,113.0,0.0,N
Male,Yes,Graduate,4691.0,100.0,1.0,Y
Male,No,Graduate,10180.0,162.0,1.0,Y
Male,Yes,Graduate,17539.0,150.0,1.0,Y
Male,Yes,Graduate,8450.0,230.0,1.0,Y
Male,Yes,Graduate,18917.0,86.0,1.0,Y
Female,No,Not Graduate,4350.0,154.0,1.0,Y
Male,Yes,Not Graduate,3095.0,113.0,1.0,Y
Male,Yes,Graduate,5233.0,128.0,1.0,Y
Male,Yes,Graduate,10833.0,234.0,1.0,Y
Male,Yes,Graduate,8333.0,246.0,1.0,Y
Male,Yes,Not Graduate,4394.0,131.0,1.0,Y
Male,No,Graduate,3547.0,80.0,0.0,N
Male,Yes,Graduate,18333.0,500.0,1.0,N
Male,Yes,Graduate,6666.0,160.0,1.0,Y
Male,No,Graduate,2435.0,75.0,1.0,N
Male,No,Not Graduate,3691.0,110.0,1.0,Y
Female,No,Not Graduate,17263.0,225.0,1.0,Y
Male,Yes,Graduate,5754.0,119.0,0.0,N
Female,Yes,Graduate,4239.0,105.0,1.0,Y
Male,Yes,Not Graduate,4300.0,107.0,1.0,Y
Male,Yes,Graduate,2895.0,95.0,1.0,Y
Male,No,Graduate,10699.0,209.0,0.0,N
Female,No,Graduate,4328.0,113.0,1.0,Y
Female,No,Graduate,3159.0,100.0,1.0,Y
Male,Yes,Graduate,10489.0,208.0,1.0,Y
Male,Yes,Not Graduate,5297.0,124.0,1.0,Y
Male,Yes,Graduate,7926.0,243.0,1.0,Y
Male,Yes,Graduate,5492.0,188.0,1.0,Y
Female,No,Graduate,13262.0,40.0,1.0,Y
Male,No,Not Graduate,4885.0,100.0,1.0,N
Male,Yes,Graduate,8069.0,250.0,1.0,Y
Male,Yes,Graduate,5318.0,148.0,1.0,Y
Male,Yes,Graduate,8796.0,70.0,1.0,N
Male,No,Graduate,9481.0,311.0,1.0,N
Male,Yes,Graduate,6894.0,150.0,1.0,Y
Female,Yes,Graduate,3663.0,113.0,1.0,Y
Male,No,Graduate,6598.0,185.0,1.0,N
Female,No,Not Graduate,3400.0,95.0,1.0,N
Male,Yes,Not Graduate,3934.0,45.0,1.0,Y
Male,No,Graduate,2500.0,55.0,1.0,Y
Male,Yes,Graduate,7101.0,100.0,1.0,Y
Male,Yes,Graduate,15114.0,480.0,1.0,Y
Male,Yes,Graduate,17500.0,400.0,1.0,Y
Male,Yes,Graduate,3775.0,110.0,1.0,Y
Male,Yes,Not Graduate,6715.0,161.0,0.0,Y
Male,No,Not Graduate,3981.0,94.0,1.0,Y
Male,No,Not Graduate,6783.0,130.0,1.0,Y
Male,Yes,Graduate,4281.0,100.0,1.0,Y
Male,No,Graduate,3588.0,110.0,0.0,N
Female,No,Not Graduate,18165.0,125.0,1.0,Y
Male,Yes,Graduate,10039.0,324.0,1.0,Y
Male,No,Graduate,3617.0,107.0,1.0,Y
Male,Yes,Not Graduate,3453.0,66.0,1.0,N
Male,Yes,Graduate,6417.0,157.0,1.0,Y
Female,Yes,Graduate,7453.0,140.0,1.0,Y
Female,No,Graduate,2138.0,99.0,0.0,N
Male,Yes,Not Graduate,4763.0,128.0,1.0,Y
Male,Yes,Graduate,4718.0,155.0,1.0,Y
Male,No,Not Graduate,3358.0,80.0,1.0,N
Male,No,Graduate,4309.0,145.0,1.0,Y
Female,No,Graduate,5000.0,103.0,0.0,N
Male,Yes,Graduate,4801.0,110.0,1.0,Y
Male,Yes,Graduate,6583.0,158.0,1.0,Y
Male,Yes,Not Graduate,4787.0,181.0,0.0,N
Male,Yes,Graduate,7859.0,132.0,0.0,N
Male,Yes,Graduate,6500.0,26.0,1.0,Y
Male,Yes,Graduate,10139.0,260.0,1.0,Y
Male,Yes,Graduate,6556.0,162.0,1.0,Y
Female,Yes,Graduate,6486.0,182.0,1.0,Y
Male,Yes,Not Graduate,3917.0,108.0,1.0,Y
Female,Yes,Graduate,19484.0,600.0,1.0,Y
Male,Yes,Graduate,7977.0,211.0,1.0,Y
Male,No,Not Graduate,5800.0,132.0,1.0,Y
Male,Yes,Graduate,8799.0,258.0,0.0,N
Male,No,Graduate,3333.0,70.0,1.0,Y
Male,Yes,Graduate,5900.0,123.0,0.0,N
Female,No,Graduate,2378.0,9.0,1.0,N
Male,Yes,Graduate,5230.0,104.0,0.0,N
Male,Yes,Graduate,5167.0,186.0,1.0,Y
Male,Yes,Graduate,16666.0,275.0,1.0,Y
Male,Yes,Not Graduate,7750.0,187.0,1.0,N
Male,Yes,Graduate,6406.0,150.0,1.0,N
Male,Yes,Graduate,3620.0,108.0,1.0,Y
Male,No,Graduate,5968.0,110.0,1.0,Y
Male,Yes,Graduate,4014.0,107.0,1.0,Y
Male,Yes,Graduate,6540.0,205.0,1.0,Y
Male,No,Graduate,35673.0,90.0,1.0,N
Female,Yes,Graduate,3166.0,36.0,1.0,Y
Male,Yes,Graduate,4704.0,146.0,0.0,N
Male,Yes,Graduate,7283.0,172.0,1.0,N
Male,Yes,Graduate,3819.0,104.0,1.0,Y
Female,No,Not Graduate,2165.0,70.0,1.0,Y
Male,Yes,Graduate,2726.0,106.0,0.0,N
Male,Yes,Graduate,6416.0,56.0,1.0,Y
Male,Yes,Graduate,6000.0,205.0,1.0,N
Male,Yes,Graduate,7159.0,142.0,1.0,Y
Male,Yes,Graduate,16120.0,260.0,1.0,Y
Male,No,Not Graduate,3833.0,110.0,1.0,Y
Male,Yes,Not Graduate,7383.0,187.0,1.0,N
Male,Yes,Graduate,9963.0,180.0,1.0,Y
Male,Yes,Graduate,5780.0,192.0,1.0,Y
Male,Yes,Graduate,5703.0,128.0,1.0,Y
Male,No,Graduate,7977.0,172.0,1.0,Y
Female,Yes,Graduate,12000.0,496.0,1.0,Y
Male,Yes,Graduate,5900.0,173.0,1.0,Y
Male,Yes,Not Graduate,5398.0,157.0,1.0,Y
Male,Yes,Graduate,5182.0,108.0,1.0,Y
Female,No,Graduate,2900.0,71.0,1.0,Y
Male,Yes,Graduate,4106.0,40.0,1.0,Y
Male,Yes,Graduate,8312.0,253.0,1.0,Y
Male,Yes,Graduate,7583.0,187.0,1.0,Y
Female,No,Graduate,4583.0,133.0,0.0,N
1 Gender Married Education TotalIncome LoanAmount CreditHistory LoanStatus
2 Male Yes Graduate 6091.0 128.0 1.0 N
3 Male Yes Graduate 3000.0 66.0 1.0 Y
4 Male Yes Not Graduate 4941.0 120.0 1.0 Y
5 Male No Graduate 6000.0 141.0 1.0 Y
6 Male Yes Graduate 9613.0 267.0 1.0 Y
7 Male Yes Not Graduate 3849.0 95.0 1.0 Y
8 Male Yes Graduate 5540.0 158.0 0.0 N
9 Male Yes Graduate 5532.0 168.0 1.0 Y
10 Male Yes Graduate 23809.0 349.0 1.0 N
11 Male Yes Graduate 3900.0 70.0 1.0 Y
12 Male Yes Graduate 11179.0 200.0 1.0 Y
13 Male No Graduate 4693.0 114.0 1.0 N
14 Male Yes Graduate 2385.0 17.0 1.0 Y
15 Male No Graduate 4950.0 125.0 1.0 Y
16 Female No Graduate 3510.0 76.0 0.0 N
17 Male Yes Not Graduate 4887.0 133.0 1.0 N
18 Male Yes Not Graduate 7660.0 104.0 0.0 N
19 Male Yes Graduate 11580.0 315.0 1.0 Y
20 Male Yes Not Graduate 4511.0 116.0 0.0 N
21 Male Yes Graduate 9560.0 191.0 1.0 Y
22 Male Yes Graduate 5052.0 122.0 1.0 Y
23 Male Yes Not Graduate 5266.0 110.0 1.0 Y
24 Male No Not Graduate 1442.0 35.0 1.0 N
25 Male No Graduate 3167.0 74.0 1.0 N
26 Male No Graduate 4692.0 106.0 1.0 N
27 Male Yes Graduate 5167.0 114.0 1.0 Y
28 Male No Graduate 15500.0 320.0 1.0 N
29 Female Yes Graduate 5126.0 144.0 1.0 Y
30 Male No Graduate 11376.0 184.0 1.0 Y
31 Male No Not Graduate 5416.0 110.0 1.0 Y
32 Male No Graduate 3600.0 80.0 1.0 N
33 Male No Graduate 3013.0 47.0 1.0 Y
34 Male Yes Graduate 6277.0 134.0 1.0 Y
35 Male Yes Graduate 5649.0 44.0 1.0 Y
36 Male Yes Graduate 5821.0 144.0 1.0 Y
37 Female Yes Graduate 6085.0 120.0 0.0 N
38 Female No Graduate 6275.0 144.0 1.0 Y
39 Female Yes Not Graduate 3572.0 100.0 1.0 Y
40 Female No Graduate 3086.0 120.0 1.0 Y
41 Female No Graduate 4230.0 112.0 1.0 N
42 Male Yes Graduate 4616.0 134.0 1.0 N
43 Female Yes Graduate 11500.0 286.0 0.0 N
44 Male Yes Graduate 3875.0 97.0 1.0 Y
45 Male Yes Graduate 3723.0 96.0 1.0 Y
46 Male Yes Graduate 5566.0 135.0 1.0 N
47 Male Yes Graduate 10330.0 180.0 1.0 Y
48 Male Yes Not Graduate 6216.0 144.0 1.0 Y
49 Male Yes Graduate 6296.0 120.0 1.0 Y
50 Male Yes Graduate 3029.0 99.0 1.0 Y
51 Male Yes Not Graduate 6058.0 165.0 0.0 N
52 Female No Graduate 4166.0 116.0 0.0 N
53 Male Yes Graduate 10321.0 258.0 1.0 N
54 Male No Not Graduate 5454.0 126.0 0.0 N
55 Male Yes Graduate 10750.0 312.0 1.0 Y
56 Male Yes Not Graduate 7100.0 125.0 1.0 Y
57 Female No Graduate 4300.0 136.0 0.0 N
58 Male Yes Graduate 6274.0 172.0 1.0 Y
59 Male Yes Not Graduate 3750.0 97.0 1.0 Y
60 Male No Graduate 3500.0 81.0 1.0 Y
61 Male Yes Graduate 7040.0 187.0 1.0 Y
62 Male No Graduate 3750.0 113.0 1.0 N
63 Male No Graduate 8500.0 176.0 1.0 N
64 Male Yes Graduate 4022.0 110.0 1.0 N
65 Male Yes Graduate 7167.0 180.0 0.0 N
66 Female No Graduate 3846.0 111.0 1.0 Y
67 Female Yes Graduate 3259.0 167.0 1.0 N
68 Male Yes Graduate 3988.0 50.0 1.0 Y
69 Male No Graduate 4897.0 136.0 1.0 Y
70 Male Yes Graduate 4618.0 104.0 1.0 Y
71 Male No Graduate 8566.0 210.0 1.0 Y
72 Male Yes Graduate 9862.0 175.0 1.0 Y
73 Male Yes Graduate 5858.0 131.0 1.0 Y
74 Male Yes Graduate 11904.0 188.0 1.0 Y
75 Male Yes Not Graduate 5093.0 81.0 1.0 Y
76 Male No Graduate 4133.0 122.0 1.0 Y
77 Male No Not Graduate 3620.0 25.0 1.0 Y
78 Female Yes Graduate 4786.0 137.0 1.0 Y
79 Male Yes Graduate 2974.0 50.0 1.0 Y
80 Male Yes Not Graduate 4188.0 115.0 1.0 Y
81 Male Yes Graduate 5300.0 131.0 1.0 Y
82 Male Yes Not Graduate 7551.0 133.0 1.0 Y
83 Male No Graduate 8649.0 151.0 1.0 Y
84 Male Yes Graduate 4082.0 100.0 1.0 Y
85 Male Yes Graduate 12543.0 225.0 1.0 Y
86 Male Yes Graduate 7400.0 216.0 0.0 N
87 Male Yes Not Graduate 2825.0 94.0 1.0 Y
88 Male No Graduate 5316.0 136.0 1.0 Y
89 Male Yes Graduate 14583.0 185.0 1.0 Y
90 Female Yes Graduate 5450.0 154.0 1.0 Y
91 Male Yes Graduate 7710.0 175.0 1.0 N
92 Female No Graduate 10408.0 259.0 1.0 Y
93 Female No Graduate 4166.0 44.0 1.0 Y
94 Female No Graduate 11117.0 137.0 0.0 Y
95 Male Yes Graduate 2957.0 81.0 1.0 Y
96 Male Yes Not Graduate 6314.0 194.0 1.0 Y
97 Male Yes Graduate 14363.0 160.0 0.0 N
98 Male No Graduate 3943.0 74.0 1.0 Y
99 Male No Graduate 2718.0 70.0 1.0 Y
100 Male Yes Graduate 3459.0 25.0 1.0 Y
101 Male No Graduate 4895.0 102.0 1.0 Y
102 Male Yes Graduate 11750.0 290.0 1.0 N
103 Female Yes Graduate 4583.0 84.0 1.0 N
104 Male Yes Graduate 6816.0 88.0 1.0 Y
105 Male No Graduate 14999.0 242.0 0.0 N
106 Male Yes Not Graduate 5630.0 129.0 1.0 N
107 Male Yes Graduate 7125.0 185.0 1.0 N
108 Male No Graduate 5417.0 168.0 1.0 Y
109 Male No Graduate 6950.0 175.0 1.0 Y
110 Male Yes Graduate 4732.0 122.0 1.0 Y
111 Male Yes Graduate 11757.0 187.0 1.0 Y
112 Female Yes Graduate 6816.0 100.0 1.0 Y
113 Female Yes Graduate 14866.0 70.0 1.0 Y
114 Male Yes Graduate 2963.0 30.0 1.0 Y
115 Female No Graduate 11666.0 225.0 1.0 N
116 Male Yes Graduate 5690.0 125.0 1.0 Y
117 Male No Graduate 6277.0 118.0 0.0 N
118 Male Yes Graduate 6327.0 152.0 1.0 Y
119 Male No Graduate 9166.0 244.0 1.0 N
120 Male Yes Not Graduate 2281.0 113.0 1.0 N
121 Male No Graduate 3254.0 50.0 1.0 Y
122 Male Yes Graduate 39999.0 600.0 0.0 Y
123 Male Yes Graduate 9538.0 187.0 1.0 Y
124 Male Yes Graduate 10208.0 255.0 1.0 Y
125 Male Yes Not Graduate 2904.0 98.0 1.0 Y
126 Male Yes Graduate 7933.0 275.0 1.0 N
127 Male Yes Graduate 4369.0 121.0 0.0 N
128 Male Yes Graduate 5614.0 158.0 1.0 Y
129 Male Yes Graduate 9323.0 75.0 1.0 Y
130 Female Yes Graduate 4583.0 112.0 1.0 N
131 Male Yes Graduate 5772.0 129.0 1.0 Y
132 Male No Graduate 2237.0 63.0 0.0 N
133 Male Yes Graduate 8000.0 200.0 1.0 Y
134 Male Yes Not Graduate 3522.0 81.0 1.0 N
135 Male Yes Graduate 11333.0 187.0 1.0 Y
136 Male Yes Not Graduate 5080.0 87.0 1.0 N
137 Male Yes Graduate 5461.0 116.0 1.0 Y
138 Male Yes Graduate 3664.0 101.0 1.0 Y
139 Male Yes Graduate 16816.0 495.0 0.0 N
140 Male Yes Graduate 3750.0 116.0 1.0 Y
141 Male No Not Graduate 3784.0 102.0 0.0 N
142 Male Yes Graduate 13650.0 180.0 0.0 N
143 Male Yes Graduate 4600.0 73.0 1.0 Y
144 Male Yes Graduate 33846.0 260.0 1.0 N
145 Female Yes Graduate 3625.0 108.0 1.0 Y
146 Male Yes Graduate 43897.0 120.0 1.0 Y
147 Male Yes Graduate 2178.0 66.0 0.0 N
148 Male Yes Graduate 9328.0 188.0 1.0 Y
149 Male No Not Graduate 4885.0 48.0 1.0 Y
150 Male No Graduate 12000.0 164.0 1.0 N
151 Male Yes Not Graduate 6033.0 160.0 1.0 N
152 Male No Graduate 3858.0 76.0 1.0 Y
153 Male No Graduate 4191.0 120.0 1.0 Y
154 Male Yes Graduate 5708.0 170.0 1.0 N
155 Male No Graduate 12083.0 187.0 1.0 Y
156 Male No Graduate 11000.0 83.0 1.0 N
157 Male Yes Not Graduate 5100.0 90.0 1.0 Y
158 Male No Graduate 4923.0 166.0 0.0 Y
159 Male Yes Not Graduate 4583.0 135.0 1.0 Y
160 Male Yes Not Graduate 3917.0 124.0 1.0 Y
161 Female No Not Graduate 4408.0 120.0 1.0 Y
162 Female No Graduate 3244.0 80.0 1.0 Y
163 Male No Not Graduate 6506.0 55.0 1.0 Y
164 Male No Graduate 2479.0 59.0 1.0 Y
165 Male No Graduate 3418.0 127.0 1.0 N
166 Female No Graduate 10000.0 214.0 1.0 N
167 Male Yes Graduate 4680.0 128.0 0.0 N
168 Male Yes Graduate 7787.0 240.0 1.0 Y
169 Male Yes Not Graduate 5703.0 130.0 1.0 Y
170 Male Yes Graduate 6194.0 137.0 1.0 Y
171 Male Yes Not Graduate 4833.0 100.0 1.0 Y
172 Male Yes Graduate 1950.0 135.0 1.0 N
173 Male Yes Graduate 5502.0 131.0 1.0 Y
174 Male Yes Graduate 2221.0 60.0 0.0 N
175 Male Yes Graduate 5726.0 116.0 1.0 Y
176 Male No Graduate 5762.0 144.0 1.0 Y
177 Male Yes Graduate 6250.0 128.0 1.0 Y
178 Male Yes Graduate 3250.0 170.0 1.0 N
179 Male Yes Graduate 7945.0 210.0 1.0 Y
180 Male No Graduate 6400.0 200.0 1.0 Y
181 Male Yes Graduate 4545.0 104.0 1.0 Y
182 Female No Graduate 8333.0 280.0 1.0 Y
183 Male Yes Graduate 4934.0 140.0 1.0 Y
184 Male Yes Graduate 6760.0 170.0 1.0 Y
185 Female No Graduate 3812.0 112.0 1.0 Y
186 Male Yes Graduate 3315.0 96.0 1.0 Y
187 Male Yes Graduate 10819.0 120.0 1.0 Y
188 Male Yes Not Graduate 4493.0 140.0 1.0 N
189 Male No Graduate 8666.0 155.0 1.0 Y
190 Male Yes Graduate 7550.0 108.0 1.0 Y
191 Male Yes Not Graduate 7823.0 123.0 1.0 Y
192 Male No Graduate 10383.0 120.0 1.0 N
193 Male Yes Graduate 9703.0 112.0 1.0 Y
194 Male Yes Not Graduate 6608.0 137.0 1.0 Y
195 Male Yes Graduate 4725.0 123.0 1.0 Y
196 Male Yes Graduate 3677.0 90.0 1.0 Y
197 Male Yes Not Graduate 5558.0 201.0 0.0 N
198 Female No Graduate 3427.0 138.0 1.0 N
199 Male No Not Graduate 4750.0 104.0 1.0 Y
200 Male Yes Not Graduate 9762.0 279.0 1.0 Y
201 Male No Graduate 16250.0 192.0 0.0 N
202 Female No Graduate 3083.0 255.0 1.0 Y
203 Male No Not Graduate 6045.0 115.0 0.0 N
204 Male Yes Graduate 5250.0 94.0 1.0 N
205 Male Yes Graduate 16783.0 304.0 1.0 N
206 Male No Graduate 4269.0 134.0 1.0 Y
207 Female No Graduate 3481.0 155.0 1.0 N
208 Female No Graduate 7200.0 120.0 1.0 Y
209 Male No Graduate 5166.0 128.0 1.0 Y
210 Male No Graduate 7542.0 151.0 1.0 Y
211 Male Yes Graduate 6095.0 150.0 1.0 Y
212 Male Yes Graduate 6144.0 160.0 0.0 Y
213 Female No Graduate 4436.0 90.0 1.0 Y
214 Female No Graduate 3237.0 30.0 1.0 Y
215 Male Yes Graduate 11146.0 136.0 1.0 Y
216 Male No Graduate 4690.0 126.0 1.0 Y
217 Male Yes Graduate 4843.0 150.0 1.0 Y
218 Male Yes Graduate 3900.0 90.0 1.0 Y
219 Male Yes Graduate 4592.0 115.0 1.0 Y
220 Male Yes Graduate 7267.0 207.0 1.0 Y
221 Male Yes Graduate 4403.0 80.0 1.0 Y
222 Male Yes Graduate 14583.0 436.0 1.0 Y
223 Male No Not Graduate 6479.0 158.0 0.0 N
224 Male Yes Graduate 4727.0 112.0 1.0 Y
225 Male Yes Graduate 3286.7999878 78.0 1.0 Y
226 Female No Graduate 3477.0 54.0 1.0 Y
227 Male No Graduate 6211.0 89.0 1.0 Y
228 Female No Graduate 4317.0 99.0 1.0 N
229 Male Yes Graduate 5704.0 120.0 1.0 Y
230 Female No Graduate 4124.0 115.0 1.0 Y
231 Male No Graduate 9508.0 187.0 1.0 Y
232 Male Yes Graduate 5491.0 139.0 1.0 Y
233 Male Yes Graduate 4400.0 127.0 0.0 N
234 Male Yes Graduate 4713.0 134.0 1.0 Y
235 Male Yes Graduate 5717.0 172.0 1.0 Y
236 Male Yes Graduate 6875.0 200.0 1.0 Y
237 Female Yes Graduate 4666.0 135.0 1.0 Y
238 Female No Graduate 7541.0 151.0 1.0 N
239 Male Yes Graduate 4939.0 113.0 1.0 N
240 Male Yes Not Graduate 4734.0 93.0 0.0 N
241 Female No Graduate 5000.0 132.0 1.0 Y
242 Male Yes Graduate 3428.0 96.0 1.0 Y
243 Male No Graduate 6500.0 140.0 1.0 Y
244 Female No Graduate 5428.0 135.0 1.0 Y
245 Female No Graduate 4263.0 104.0 0.0 N
246 Male No Graduate 20233.0 480.0 1.0 N
247 Female No Graduate 2917.0 84.0 1.0 Y
248 Male No Not Graduate 5332.0 111.0 1.0 Y
249 Female No Graduate 2507.0 56.0 1.0 Y
250 Male Yes Not Graduate 5039.0 111.0 1.0 Y
251 Male Yes Graduate 3717.0 120.0 1.0 Y
252 Male Yes Graduate 10000.0 155.0 1.0 N
253 Male Yes Graduate 4567.0 115.0 1.0 Y
254 Male Yes Graduate 4531.0 124.0 1.0 Y
255 Male Yes Graduate 15000.0 300.0 1.0 Y
256 Male Yes Graduate 13649.0 376.0 0.0 N
257 Male No Graduate 4917.0 130.0 0.0 Y
258 Male Yes Graduate 7978.0 184.0 1.0 Y
259 Female Yes Graduate 6784.0 110.0 1.0 N
260 Female No Graduate 2500.0 67.0 1.0 Y
261 Male No Graduate 6177.0 117.0 1.0 Y
262 Male No Graduate 2935.0 98.0 1.0 Y
263 Male Yes Graduate 7100.0 176.0 1.0 Y
264 Female No Graduate 4160.0 71.0 1.0 Y
265 Male Yes Not Graduate 4234.0 173.0 1.0 N
266 Female No Graduate 2378.0 46.0 1.0 N
267 Male Yes Not Graduate 5783.0 158.0 1.0 Y
268 Male Yes Not Graduate 3173.0 74.0 1.0 Y
269 Male Yes Graduate 4957.0 160.0 1.0 Y
270 Male Yes Not Graduate 5251.0 126.0 1.0 Y
271 Male Yes Graduate 8875.0 187.0 1.0 Y
272 Male Yes Graduate 9083.0 228.0 1.0 Y
273 Male No Graduate 12917.0 308.0 1.0 N
274 Male Yes Graduate 4749.0 95.0 1.0 Y
275 Female Yes Graduate 5500.0 105.0 0.0 N
276 Female Yes Graduate 2928.0 130.0 1.0 Y
277 Male Yes Graduate 11500.0 165.0 1.0 Y
278 Male Yes Graduate 3875.0 67.0 1.0 N
279 Male Yes Not Graduate 4666.0 100.0 0.0 N
280 Male Yes Graduate 8334.0 200.0 1.0 Y
281 Female No Graduate 4723.0 81.0 1.0 N
282 Male Yes Graduate 8667.0 236.0 1.0 Y
283 Male Yes Graduate 7083.0 130.0 1.0 Y
284 Male No Graduate 6822.0 141.0 1.0 Y
285 Male No Not Graduate 6216.0 133.0 1.0 N
286 Male No Graduate 2500.0 96.0 1.0 N
287 Male Yes Graduate 6325.0 175.0 1.0 Y
288 Male Yes Graduate 24996.0 570.0 1.0 N
289 Female No Graduate 15759.0 55.0 1.0 Y
290 Male Yes Graduate 5185.0 155.0 1.0 Y
291 Male Yes Graduate 17196.0 380.0 1.0 Y
292 Male No Graduate 5049.0 111.0 0.0 N
293 Male Yes Graduate 5740.0 120.0 1.0 Y
294 Male Yes Graduate 13746.0 130.0 1.0 Y
295 Male No Graduate 3069.0 71.0 1.0 N
296 Male Yes Graduate 5391.0 130.0 1.0 Y
297 Male No Graduate 10173.0 296.0 1.0 Y
298 Female No Graduate 6000.0 156.0 1.0 Y
299 Male No Graduate 7167.0 128.0 1.0 Y
300 Male Yes Graduate 4566.0 100.0 1.0 N
301 Male No Not Graduate 3946.0 132.0 1.0 Y
302 Male Yes Graduate 4750.0 136.0 1.0 Y
303 Male Yes Graduate 5488.0 125.0 1.0 Y
304 Male No Graduate 9167.0 185.0 1.0 Y
305 Male Yes Graduate 9504.0 275.0 1.0 Y
306 Male Yes Not Graduate 3618.0 113.0 1.0 Y
307 Male Yes Graduate 4500.0 113.0 1.0 Y
308 Female No Graduate 3180.0 71.0 0.0 N
309 Male Yes Graduate 4492.0 95.0 1.0 Y
310 Male No Not Graduate 5568.0 109.0 1.0 Y
311 Female No Graduate 3300.0 103.0 0.0 N
312 Male Yes Not Graduate 2889.0 45.0 0.0 N
313 Male No Not Graduate 2755.0 65.0 1.0 N
314 Male No Graduate 22500.0 103.0 1.0 Y
315 Female No Not Graduate 1963.0 53.0 1.0 Y
316 Female No Graduate 7441.0 194.0 1.0 N
317 Female No Graduate 4547.0 115.0 1.0 Y
318 Male Yes Not Graduate 4567.0 115.0 1.0 Y
319 Female No Not Graduate 2213.0 66.0 1.0 Y
320 Male Yes Graduate 8300.0 152.0 0.0 N
321 Male Yes Graduate 81000.0 360.0 0.0 N
322 Female No Not Graduate 3867.0 62.0 1.0 N
323 Male Yes Not Graduate 6096.0 218.0 0.0 N
324 Male Yes Not Graduate 4286.0 110.0 1.0 Y
325 Female Yes Not Graduate 5386.0 178.0 0.0 N
326 Female No Graduate 2995.0 60.0 1.0 Y
327 Female No Graduate 2600.0 160.0 1.0 N
328 Male Yes Graduate 21600.0 239.0 1.0 N
329 Male Yes Graduate 3798.0 112.0 1.0 Y
330 Male Yes Graduate 4663.0 138.0 1.0 Y
331 Male Yes Graduate 5829.0 138.0 1.0 Y
332 Male Yes Graduate 3539.0 100.0 1.0 Y
333 Male Yes Graduate 14880.0 96.0 1.0 Y
334 Male Yes Graduate 6966.0 121.0 1.0 Y
335 Female No Not Graduate 4606.0 81.0 1.0 N
336 Male Yes Graduate 5935.0 133.0 1.0 Y
337 Male Yes Graduate 2936.12000084 87.0 1.0 Y
338 Male No Not Graduate 2717.0 60.0 1.0 Y
339 Female No Graduate 8624.0 150.0 1.0 Y
340 Male No Graduate 6500.0 105.0 0.0 N
341 Male Yes Graduate 4765.0 143.0 1.0 Y
342 Male No Graduate 3750.0 100.0 1.0 Y
343 Male No Graduate 3777.0 50.0 1.0 Y
344 Male No Graduate 10416.0 187.0 0.0 N
345 Female Yes Not Graduate 7142.0 138.0 1.0 Y
346 Male No Graduate 8724.0 187.0 1.0 Y
347 Male Yes Graduate 9734.0 180.0 1.0 Y
348 Male No Not Graduate 6700.0 148.0 1.0 Y
349 Male No Graduate 37719.0 152.0 1.0 Y
350 Male Yes Graduate 4676.0 130.0 1.0 Y
351 Male Yes Not Graduate 4652.0 110.0 1.0 Y
352 Male Yes Graduate 5050.0 150.0 0.0 N
353 Male Yes Not Graduate 3564.0 125.0 0.0 N
354 Male Yes Graduate 5681.0 149.0 0.0 N
355 Male Yes Graduate 4949.0 90.0 0.0 Y
356 Male No Graduate 7085.0 84.0 1.0 Y
357 Male Yes Graduate 3859.0 96.0 1.0 Y
358 Male Yes Graduate 4301.0 118.0 1.0 Y
359 Male Yes Graduate 6277.0 173.0 1.0 N
360 Male No Graduate 4354.0 136.0 1.0 Y
361 Male Yes Graduate 8334.0 160.0 1.0 N
362 Male Yes Graduate 7740.0 128.0 1.0 Y
363 Male Yes Graduate 5203.0 153.0 1.0 Y
364 Male No Graduate 4166.0 98.0 0.0 N
365 Male No Graduate 6000.0 140.0 1.0 Y
366 Male Yes Not Graduate 4611.0 70.0 0.0 N
367 Male Yes Graduate 6784.0 110.0 1.0 N
368 Male Yes Graduate 5529.0 162.0 1.0 Y
369 Male Yes Not Graduate 4153.0 113.0 0.0 N
370 Male Yes Graduate 4691.0 100.0 1.0 Y
371 Male No Graduate 10180.0 162.0 1.0 Y
372 Male Yes Graduate 17539.0 150.0 1.0 Y
373 Male Yes Graduate 8450.0 230.0 1.0 Y
374 Male Yes Graduate 18917.0 86.0 1.0 Y
375 Female No Not Graduate 4350.0 154.0 1.0 Y
376 Male Yes Not Graduate 3095.0 113.0 1.0 Y
377 Male Yes Graduate 5233.0 128.0 1.0 Y
378 Male Yes Graduate 10833.0 234.0 1.0 Y
379 Male Yes Graduate 8333.0 246.0 1.0 Y
380 Male Yes Not Graduate 4394.0 131.0 1.0 Y
381 Male No Graduate 3547.0 80.0 0.0 N
382 Male Yes Graduate 18333.0 500.0 1.0 N
383 Male Yes Graduate 6666.0 160.0 1.0 Y
384 Male No Graduate 2435.0 75.0 1.0 N
385 Male No Not Graduate 3691.0 110.0 1.0 Y
386 Female No Not Graduate 17263.0 225.0 1.0 Y
387 Male Yes Graduate 5754.0 119.0 0.0 N
388 Female Yes Graduate 4239.0 105.0 1.0 Y
389 Male Yes Not Graduate 4300.0 107.0 1.0 Y
390 Male Yes Graduate 2895.0 95.0 1.0 Y
391 Male No Graduate 10699.0 209.0 0.0 N
392 Female No Graduate 4328.0 113.0 1.0 Y
393 Female No Graduate 3159.0 100.0 1.0 Y
394 Male Yes Graduate 10489.0 208.0 1.0 Y
395 Male Yes Not Graduate 5297.0 124.0 1.0 Y
396 Male Yes Graduate 7926.0 243.0 1.0 Y
397 Male Yes Graduate 5492.0 188.0 1.0 Y
398 Female No Graduate 13262.0 40.0 1.0 Y
399 Male No Not Graduate 4885.0 100.0 1.0 N
400 Male Yes Graduate 8069.0 250.0 1.0 Y
401 Male Yes Graduate 5318.0 148.0 1.0 Y
402 Male Yes Graduate 8796.0 70.0 1.0 N
403 Male No Graduate 9481.0 311.0 1.0 N
404 Male Yes Graduate 6894.0 150.0 1.0 Y
405 Female Yes Graduate 3663.0 113.0 1.0 Y
406 Male No Graduate 6598.0 185.0 1.0 N
407 Female No Not Graduate 3400.0 95.0 1.0 N
408 Male Yes Not Graduate 3934.0 45.0 1.0 Y
409 Male No Graduate 2500.0 55.0 1.0 Y
410 Male Yes Graduate 7101.0 100.0 1.0 Y
411 Male Yes Graduate 15114.0 480.0 1.0 Y
412 Male Yes Graduate 17500.0 400.0 1.0 Y
413 Male Yes Graduate 3775.0 110.0 1.0 Y
414 Male Yes Not Graduate 6715.0 161.0 0.0 Y
415 Male No Not Graduate 3981.0 94.0 1.0 Y
416 Male No Not Graduate 6783.0 130.0 1.0 Y
417 Male Yes Graduate 4281.0 100.0 1.0 Y
418 Male No Graduate 3588.0 110.0 0.0 N
419 Female No Not Graduate 18165.0 125.0 1.0 Y
420 Male Yes Graduate 10039.0 324.0 1.0 Y
421 Male No Graduate 3617.0 107.0 1.0 Y
422 Male Yes Not Graduate 3453.0 66.0 1.0 N
423 Male Yes Graduate 6417.0 157.0 1.0 Y
424 Female Yes Graduate 7453.0 140.0 1.0 Y
425 Female No Graduate 2138.0 99.0 0.0 N
426 Male Yes Not Graduate 4763.0 128.0 1.0 Y
427 Male Yes Graduate 4718.0 155.0 1.0 Y
428 Male No Not Graduate 3358.0 80.0 1.0 N
429 Male No Graduate 4309.0 145.0 1.0 Y
430 Female No Graduate 5000.0 103.0 0.0 N
431 Male Yes Graduate 4801.0 110.0 1.0 Y
432 Male Yes Graduate 6583.0 158.0 1.0 Y
433 Male Yes Not Graduate 4787.0 181.0 0.0 N
434 Male Yes Graduate 7859.0 132.0 0.0 N
435 Male Yes Graduate 6500.0 26.0 1.0 Y
436 Male Yes Graduate 10139.0 260.0 1.0 Y
437 Male Yes Graduate 6556.0 162.0 1.0 Y
438 Female Yes Graduate 6486.0 182.0 1.0 Y
439 Male Yes Not Graduate 3917.0 108.0 1.0 Y
440 Female Yes Graduate 19484.0 600.0 1.0 Y
441 Male Yes Graduate 7977.0 211.0 1.0 Y
442 Male No Not Graduate 5800.0 132.0 1.0 Y
443 Male Yes Graduate 8799.0 258.0 0.0 N
444 Male No Graduate 3333.0 70.0 1.0 Y
445 Male Yes Graduate 5900.0 123.0 0.0 N
446 Female No Graduate 2378.0 9.0 1.0 N
447 Male Yes Graduate 5230.0 104.0 0.0 N
448 Male Yes Graduate 5167.0 186.0 1.0 Y
449 Male Yes Graduate 16666.0 275.0 1.0 Y
450 Male Yes Not Graduate 7750.0 187.0 1.0 N
451 Male Yes Graduate 6406.0 150.0 1.0 N
452 Male Yes Graduate 3620.0 108.0 1.0 Y
453 Male No Graduate 5968.0 110.0 1.0 Y
454 Male Yes Graduate 4014.0 107.0 1.0 Y
455 Male Yes Graduate 6540.0 205.0 1.0 Y
456 Male No Graduate 35673.0 90.0 1.0 N
457 Female Yes Graduate 3166.0 36.0 1.0 Y
458 Male Yes Graduate 4704.0 146.0 0.0 N
459 Male Yes Graduate 7283.0 172.0 1.0 N
460 Male Yes Graduate 3819.0 104.0 1.0 Y
461 Female No Not Graduate 2165.0 70.0 1.0 Y
462 Male Yes Graduate 2726.0 106.0 0.0 N
463 Male Yes Graduate 6416.0 56.0 1.0 Y
464 Male Yes Graduate 6000.0 205.0 1.0 N
465 Male Yes Graduate 7159.0 142.0 1.0 Y
466 Male Yes Graduate 16120.0 260.0 1.0 Y
467 Male No Not Graduate 3833.0 110.0 1.0 Y
468 Male Yes Not Graduate 7383.0 187.0 1.0 N
469 Male Yes Graduate 9963.0 180.0 1.0 Y
470 Male Yes Graduate 5780.0 192.0 1.0 Y
471 Male Yes Graduate 5703.0 128.0 1.0 Y
472 Male No Graduate 7977.0 172.0 1.0 Y
473 Female Yes Graduate 12000.0 496.0 1.0 Y
474 Male Yes Graduate 5900.0 173.0 1.0 Y
475 Male Yes Not Graduate 5398.0 157.0 1.0 Y
476 Male Yes Graduate 5182.0 108.0 1.0 Y
477 Female No Graduate 2900.0 71.0 1.0 Y
478 Male Yes Graduate 4106.0 40.0 1.0 Y
479 Male Yes Graduate 8312.0 253.0 1.0 Y
480 Male Yes Graduate 7583.0 187.0 1.0 Y
481 Female No Graduate 4583.0 133.0 0.0 N

View File

@@ -0,0 +1,600 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "5da8da61",
"metadata": {},
"source": [
"# Exercise 2: Classification system with KNN - To Loan or Not To Loan"
]
},
{
"cell_type": "markdown",
"id": "9669e493",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "markdown",
"id": "22bbd869",
"metadata": {},
"source": [
"Import some useful libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "26758936",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.preprocessing import OrdinalEncoder, StandardScaler\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "markdown",
"id": "abc131ca",
"metadata": {},
"source": [
"## a. Getting started"
]
},
{
"cell_type": "markdown",
"id": "45b518e5",
"metadata": {},
"source": [
"### Data loading"
]
},
{
"cell_type": "markdown",
"id": "1ef061f2",
"metadata": {},
"source": [
"The original dataset comes from Kaggle's [Loan Prediction](https://www.kaggle.com/ninzaami/loan-predication) problem. The provided dataset has already undergone some processing, such as removing some columns and invalid data. Pandas is used to read the CSV file."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a23f62b5",
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(\"loandata.csv\")"
]
},
{
"cell_type": "markdown",
"id": "02ca77c7",
"metadata": {},
"source": [
"Display the head of the data."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f4bec500",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Gender</th>\n",
" <th>Married</th>\n",
" <th>Education</th>\n",
" <th>TotalIncome</th>\n",
" <th>LoanAmount</th>\n",
" <th>CreditHistory</th>\n",
" <th>LoanStatus</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Male</td>\n",
" <td>Yes</td>\n",
" <td>Graduate</td>\n",
" <td>6091.0</td>\n",
" <td>128.0</td>\n",
" <td>1.0</td>\n",
" <td>N</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Male</td>\n",
" <td>Yes</td>\n",
" <td>Graduate</td>\n",
" <td>3000.0</td>\n",
" <td>66.0</td>\n",
" <td>1.0</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Male</td>\n",
" <td>Yes</td>\n",
" <td>Not Graduate</td>\n",
" <td>4941.0</td>\n",
" <td>120.0</td>\n",
" <td>1.0</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Male</td>\n",
" <td>No</td>\n",
" <td>Graduate</td>\n",
" <td>6000.0</td>\n",
" <td>141.0</td>\n",
" <td>1.0</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Male</td>\n",
" <td>Yes</td>\n",
" <td>Graduate</td>\n",
" <td>9613.0</td>\n",
" <td>267.0</td>\n",
" <td>1.0</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Gender Married Education TotalIncome LoanAmount CreditHistory \\\n",
"0 Male Yes Graduate 6091.0 128.0 1.0 \n",
"1 Male Yes Graduate 3000.0 66.0 1.0 \n",
"2 Male Yes Not Graduate 4941.0 120.0 1.0 \n",
"3 Male No Graduate 6000.0 141.0 1.0 \n",
"4 Male Yes Graduate 9613.0 267.0 1.0 \n",
"\n",
" LoanStatus \n",
"0 N \n",
"1 Y \n",
"2 Y \n",
"3 Y \n",
"4 Y "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "markdown",
"id": "e271b475",
"metadata": {},
"source": [
"Data's columns:\n",
"* **Gender:** Applicant gender (Male/ Female)\n",
"* **Married:** Is the Applicant married? (Yes/No)\n",
"* **Education:** Applicant Education (Graduate/ Not Graduate)\n",
"* **TotalIncome:** Applicant total income (sum of `ApplicantIncome` and `CoapplicantIncome` columns in the original dataset)\n",
"* **LoanAmount:** Loan amount in thousands\n",
"* **CreditHistory:** Credit history meets guidelines\n",
"* **LoanStatus** (Target)**:** Loan approved (Y/N)"
]
},
{
"cell_type": "markdown",
"id": "702ce4e6",
"metadata": {},
"source": [
"### Data preprocessing"
]
},
{
"cell_type": "markdown",
"id": "7fce724c",
"metadata": {},
"source": [
"Define a list of categorical columns to encode."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2c56efa5",
"metadata": {},
"outputs": [],
"source": [
"categorical_columns = [\"Gender\", \"Married\", \"Education\", \"LoanStatus\"]"
]
},
{
"cell_type": "markdown",
"id": "d8915a68",
"metadata": {},
"source": [
"Encode categorical columns using the [`OrdinalEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) of scikit learn."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "dc5f9cda",
"metadata": {},
"outputs": [],
"source": [
"data[categorical_columns] = OrdinalEncoder().fit_transform(data[categorical_columns])"
]
},
{
"cell_type": "markdown",
"id": "df9c84b4",
"metadata": {},
"source": [
"Split into `X` and `y`."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "83beacfb",
"metadata": {},
"outputs": [],
"source": [
"X = data.drop(columns=\"LoanStatus\")\n",
"y = data.LoanStatus"
]
},
{
"cell_type": "markdown",
"id": "e25c8f24",
"metadata": {},
"source": [
"Normalize data using the [`StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) of scikit learn."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9c567bb7",
"metadata": {},
"outputs": [],
"source": [
"X[X.columns] = StandardScaler().fit_transform(X[X.columns])"
]
},
{
"cell_type": "markdown",
"id": "7437ea21",
"metadata": {},
"source": [
"Convert `y` type to `int` "
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c0db7c1f",
"metadata": {},
"outputs": [],
"source": [
"y = y.astype(int)"
]
},
{
"cell_type": "markdown",
"id": "6d1d1f10",
"metadata": {},
"source": [
"Split dataset into train and test sets."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b05be2cc",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "markdown",
"id": "8f6d3ce6",
"metadata": {},
"source": [
"## b. Dummy classifier"
]
},
{
"cell_type": "markdown",
"id": "80ec4058",
"metadata": {},
"source": [
"Build a dummy classifier that takes decisions randomly."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "30919672",
"metadata": {},
"outputs": [],
"source": [
"class DummyClassifier():\n",
" \n",
" def __init__(self):\n",
" \"\"\"\n",
" Initialize the class.\n",
" \"\"\"\n",
" pass\n",
" \n",
" def fit(self, X, y):\n",
" \"\"\"\n",
" Fit the dummy classifier.\n",
" \n",
" Parameters\n",
" ----------\n",
" X : Numpy array or Pandas DataFrame of shape (n_samples, n_features)\n",
" Training data.\n",
" y : Numpy array or Pandas DataFrame of shape (n_samples,)\n",
" Target values.\n",
" \"\"\"\n",
" pass\n",
" \n",
" def predict(self, X):\n",
" \"\"\"\n",
" Predict the class labels for the provided data.\n",
"\n",
" Parameters\n",
" ----------\n",
" X : Numpy array or Pandas DataFrame of shape (n_queries, n_features)\n",
" Test samples.\n",
"\n",
" Returns\n",
" -------\n",
" y : Numpy array or Pandas DataFrame of shape (n_queries,)\n",
" Class labels for each data sample.\n",
" \"\"\"\n",
" pass"
]
},
{
"cell_type": "markdown",
"id": "1dd67c48",
"metadata": {},
"source": [
"Implement a function to evaluate the performance of a classification by computing the accuracy ($N_{correct}/N$)."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "184f3905",
"metadata": {},
"outputs": [],
"source": [
"def accuracy_score(y_true, y_pred):\n",
" pass"
]
},
{
"cell_type": "markdown",
"id": "90dcae17",
"metadata": {},
"source": [
"Compute the performance of the dummy classifier using the provided test set."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa666b66",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "9e10cd97",
"metadata": {},
"source": [
"## c. K-Nearest Neighbors classifier"
]
},
{
"cell_type": "markdown",
"id": "70009457",
"metadata": {},
"source": [
"Build a K-Nearest Neighbors classifier using a Euclidean distance computation and a simple majority voting criterion."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "759e924e",
"metadata": {},
"outputs": [],
"source": [
"class KNNClassifier():\n",
" \n",
" def __init__(self, n_neighbors=3):\n",
" \"\"\"\n",
" Initialize the class.\n",
" \n",
" Parameters\n",
" ----------\n",
" n_neighbors : int, default=3\n",
" Number of neighbors to use by default.\n",
" \"\"\"\n",
" pass\n",
" \n",
" def fit(self, X, y):\n",
" \"\"\"\n",
" Fit the k-nearest neighbors classifier.\n",
" \n",
" Parameters\n",
" ----------\n",
" X : Numpy array or Pandas DataFrame of shape (n_samples, n_features)\n",
" Training data.\n",
" y : Numpy array or Pandas DataFrame of shape (n_samples,)\n",
" Target values.\n",
" \"\"\"\n",
" pass\n",
" \n",
" @staticmethod\n",
" def _euclidian_distance(a, b):\n",
" \"\"\"\n",
" Utility function to compute the euclidian distance.\n",
" \n",
" Parameters\n",
" ----------\n",
" a : Numpy array or Pandas DataFrame\n",
" First operand.\n",
" b : Numpy array or Pandas DataFrame\n",
" Second operand.\n",
" \"\"\"\n",
" pass\n",
" \n",
" def predict(self, X):\n",
" \"\"\"\n",
" Predict the class labels for the provided data.\n",
"\n",
" Parameters\n",
" ----------\n",
" X : Numpy array or Pandas DataFrame of shape (n_queries, n_features)\n",
" Test samples.\n",
"\n",
" Returns\n",
" -------\n",
" y : Numpy array or Pandas DataFrame of shape (n_queries,)\n",
" Class labels for each data sample.\n",
" \"\"\"\n",
" pass"
]
},
{
"cell_type": "markdown",
"id": "6c2b4811",
"metadata": {},
"source": [
"Compute the performance of the system as a function of $k = 1...7$."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf589e66",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "71c51f35",
"metadata": {},
"source": [
"Run the KNN algorithm using only the features `TotalIncome` and `CreditHistory`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f6f262b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "e2b1a682",
"metadata": {},
"source": [
"Re-run the KNN algorithm using the features `TotalIncome`, `CreditHistory` and `Married`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0bda7ee",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "2724167a",
"metadata": {},
"source": [
"Re-run the KNN algorithm using all features."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46ec9699",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "648aa52e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,666 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# k-Nearest Neighbor (kNN) exercise 3 - MNIST Dataset\n",
"\n",
"*Complete and hand in this completed worksheet.*\n",
"\n",
"The kNN classifier consists of two stages:\n",
"\n",
"- During training, the classifier takes the training data and simply remembers it\n",
"- During testing, kNN classifies every test image by comparing it to all training images and transferring the labels of the k most similar training examples\n",
"- In this exercise, the ultimate goal is to find an optimal value of hyper-parameter k through a cross-validation procedure.\n",
"\n",
"In this exercise you will implement these steps and gain proficiency in writing efficient, vectorized code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run some setup code for this notebook.\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import os\n",
"\n",
"# This is a bit of magic to make matplotlib figures appear inline in the notebook\n",
"# rather than in a new window. Also setting some parameters for display.\n",
"%matplotlib inline\n",
"plt.rcParams['figure.figsize'] = (10.0, 10.0) # set default size of plots\n",
"plt.rcParams['image.interpolation'] = 'nearest'\n",
"plt.rcParams['image.cmap'] = 'gray'\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# This is a method to read the MNIST dataset from a ROOT directory\n",
"def load_MNIST(ROOT):\n",
" '''load all of mnist\n",
" training set first'''\n",
" Xtr = []\n",
" train = pd.read_csv(os.path.join(ROOT, 'mnist_train.csv'))\n",
" X = np.array(train.drop('label', axis=1))\n",
" Ytr = np.array(train['label'])\n",
" # With this for-loop we give the data a shape of the actual image (28x28)\n",
" # instead of the shape in file (1x784)\n",
" for row in X:\n",
" Xtr.append(row.reshape(28,28))\n",
" # load test set second\n",
" Xte = []\n",
" test = pd.read_csv(os.path.join(ROOT, 'mnist_test.csv'))\n",
" X = np.array(test.drop('label', axis=1))\n",
" Yte = np.array(test['label'])\n",
" # same reshaping\n",
" for row in X:\n",
" Xte.append(row.reshape(28,28))\n",
" \n",
" return np.array(Xtr), np.array(Ytr), np.array(Xte), np.array(Yte)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the raw MNIST data.\n",
"mnist_dir = 'YOUR-MNIST-DIR-HERE' # TODO: update this dir information to your own dir\n",
"X_train, y_train, X_test, y_test = load_MNIST(mnist_dir)\n",
"\n",
"# As a sanity check, we print out the size of the training and test data.\n",
"print('Training data shape: ', X_train.shape)\n",
"print('Training labels shape: ', y_train.shape)\n",
"print('Test data shape: ', X_test.shape)\n",
"print('Test labels shape: ', y_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Inline Question #1:** Notice the outputs of the shape attributes for the numpy arrays downloaded.\n",
"\n",
"- What are the ranks of the arrays for the training data and test data?\n",
"- Are the shapes consistent with the description of the dataset that we can find [here](http://yann.lecun.com/exdb/mnist/)? Explain the different dimensions of the 4 arrays in the cell above."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Your Answer**: *fill this in.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now let's visualise some of the images\n",
"classes = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']\n",
"num_classes = len(classes)\n",
"samples_per_class = 7\n",
"for y, cls in enumerate(classes): # y and cls takes values from 0-9\n",
" idxs = np.flatnonzero(y_train == y) # gets the indices of samples that corresponds to class y\n",
" idxs = np.random.choice(idxs, samples_per_class, replace=False) # picks randomly samples_per_class indices\n",
" for i, idx in enumerate(idxs):\n",
" plt_idx = i * num_classes + y + 1 # determines the sub-plot index\n",
" plt.subplot(samples_per_class, num_classes, plt_idx)\n",
" plt.imshow(X_train[idx].astype('uint8'))\n",
" plt.axis('off')\n",
" if i == 0:\n",
" plt.title(cls)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Subsample the data for more efficient code execution in this exercise. We do this to make it go faster. \n",
"# When you will have completed the whole notebook, you can run it again on a larger (or total) dataset \n",
"# and observe the difference in terms of accuracy (and speedup).\n",
"num_training = 5000\n",
"mask = range(num_training)\n",
"X_train = X_train[mask]\n",
"y_train = y_train[mask]\n",
"\n",
"num_test = 500\n",
"mask = range(num_test)\n",
"X_test = X_test[mask]\n",
"y_test = y_test[mask]\n",
"\n",
"# TODO: sanity check: write code to print out the size of the subsampled training and test data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Shape the images vectors\n",
"X_train = np.reshape(X_train, (X_train.shape[0], -1)) # when reshaping, -1 means \"infer target dims from orig dims\"\n",
"X_test = np.reshape(X_test, (X_test.shape[0], -1)) # in this case it flattens each (28,28) image into a 784-element vector\n",
"print(X_train.shape, X_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Inline Question #2:** Notice the use of np.reshape to transform images into vectors.\n",
"\n",
"- What is the effect of -1 in the reshape command?\n",
"- Are the resulting shapes consistent with this vectorization? Explain."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Your Answer**: *fill this in.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This is a class definition for our KNN classifier. Complete the code indicated by the TODO sections.\n",
"import numpy as np\n",
"\n",
"class KNearestNeighbor(object):\n",
" \"\"\" a kNN classifier with L2 distance \"\"\"\n",
"\n",
" def __init__(self):\n",
" pass\n",
"\n",
" def train(self, X, y):\n",
" \"\"\"\n",
" Train the classifier. For k-nearest neighbors this is just \n",
" memorizing the training data.\n",
"\n",
" Inputs:\n",
" - X: A numpy array of shape (num_train, D) containing the training data\n",
" consisting of num_train samples each of dimension D.\n",
" - y: A numpy array of shape (N,) containing the training labels, where\n",
" y[i] is the label for X[i].\n",
" \"\"\"\n",
" self.X_train = X\n",
" self.y_train = y\n",
" \n",
" def predict(self, X, k=1, num_loops=0):\n",
" \"\"\"\n",
" Predict labels for test data using this classifier.\n",
"\n",
" Inputs:\n",
" - X: A numpy array of shape (num_test, D) containing test data consisting\n",
" of num_test samples each of dimension D.\n",
" - k: The number of nearest neighbors that vote for the predicted labels.\n",
" - num_loops: Determines which implementation to use to compute distances\n",
" between training points and testing points.\n",
"\n",
" Returns:\n",
" - y: A numpy array of shape (num_test,) containing predicted labels for the\n",
" test data, where y[i] is the predicted label for the test point X[i]. \n",
" \"\"\"\n",
" if num_loops == 0:\n",
" dists = self.compute_distances_no_loops(X)\n",
" elif num_loops == 1:\n",
" dists = self.compute_distances_one_loop(X)\n",
" elif num_loops == 2:\n",
" dists = self.compute_distances_two_loops(X)\n",
" else:\n",
" raise ValueError('Invalid value %d for num_loops' % num_loops)\n",
"\n",
" return self.predict_labels(dists, k=k)\n",
"\n",
" def compute_distances_two_loops(self, X):\n",
" \"\"\"\n",
" Compute the distance between each test point in X and each training point\n",
" in self.X_train using a nested loop over both the training data and the \n",
" test data.\n",
"\n",
" Inputs:\n",
" - X: A numpy array of shape (num_test, D) containing test data.\n",
"\n",
" Returns:\n",
" - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
" is the Euclidean distance between the ith test point and the jth training\n",
" point.\n",
" \"\"\"\n",
" num_test = X.shape[0]\n",
" num_train = self.X_train.shape[0]\n",
" dists = np.zeros((num_test, num_train))\n",
" for i in range(num_test):\n",
" for j in range(num_train):\n",
" #####################################################################\n",
" # TODO: #\n",
" # Compute the l2 distance between the ith test point and the jth #\n",
" # training point, and store the result in dists[i, j]. You should #\n",
" # not use a loop over dimension. #\n",
" #####################################################################\n",
" pass\n",
" \n",
" #####################################################################\n",
" # END OF YOUR CODE #\n",
" #####################################################################\n",
" return dists\n",
"\n",
" def compute_distances_one_loop(self, X):\n",
" \"\"\"\n",
" Compute the distance between each test point in X and each training point\n",
" in self.X_train using a single loop over the test data.\n",
"\n",
" Input / Output: Same as compute_distances_two_loops\n",
" \"\"\"\n",
" num_test = X.shape[0]\n",
" num_train = self.X_train.shape[0]\n",
" dists = np.zeros((num_test, num_train))\n",
" for i in range(num_test):\n",
" #######################################################################\n",
" # TODO: #\n",
" # Compute the l2 distance between the ith test point and all training #\n",
" # points, and store the result in dists[i, :]. #\n",
" #######################################################################\n",
" pass\n",
" \n",
" #######################################################################\n",
" # END OF YOUR CODE #\n",
" #######################################################################\n",
" return dists\n",
"\n",
" def compute_distances_no_loops(self, X):\n",
" \"\"\"\n",
" Compute the distance between each test point in X and each training point\n",
" in self.X_train using no explicit loops.\n",
"\n",
" Input / Output: Same as compute_distances_two_loops\n",
" \"\"\"\n",
" num_test = X.shape[0]\n",
" num_train = self.X_train.shape[0]\n",
" dists = np.zeros((num_test, num_train)) \n",
" #########################################################################\n",
" # TODO: #\n",
" # Compute the l2 distance between all test points and all training #\n",
" # points without using any explicit loops, and store the result in #\n",
" # dists. #\n",
" # #\n",
" # You should implement this function using only basic array operations; #\n",
" # in particular you should not use functions from scipy. #\n",
" # #\n",
" # HINT: Try to formulate the l2 distance using matrix multiplication #\n",
" # and two broadcast sums. #\n",
" #########################################################################\n",
" pass\n",
"\n",
" #########################################################################\n",
" # END OF YOUR CODE #\n",
" #########################################################################\n",
" return dists\n",
"\n",
" def predict_labels(self, dists, k=1):\n",
" \"\"\"\n",
" Given a matrix of distances between test points and training points,\n",
" predict a label for each test point.\n",
"\n",
" Inputs:\n",
" - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
" gives the distance between the ith test point and the jth training point.\n",
"\n",
" Returns:\n",
" - y: A numpy array of shape (num_test,) containing predicted labels for the\n",
" test data, where y[i] is the predicted label for the test point X[i]. \n",
" \"\"\"\n",
" num_test = dists.shape[0]\n",
" y_pred = np.zeros(num_test)\n",
" for i in range(num_test):\n",
" # A list of length k storing the labels of the k nearest neighbors to\n",
" # the ith test point.\n",
" closest_y = []\n",
" #########################################################################\n",
" # TODO: #\n",
" # Use the distance matrix to find the k nearest neighbors of the ith #\n",
" # testing point, and use self.y_train to find the labels of these #\n",
" # neighbors. Store these labels in closest_y. #\n",
" # Hint: Look up the function numpy.argsort. #\n",
" #########################################################################\n",
" pass\n",
" \n",
" #########################################################################\n",
" # TODO: #\n",
" # Now that you have found the labels of the k nearest neighbors, you #\n",
" # need to find the most common label in the list closest_y of labels. #\n",
" # Store this label in y_pred[i]. Break ties by choosing the smaller #\n",
" # label. #\n",
" #########################################################################\n",
" pass\n",
" \n",
" #########################################################################\n",
" # END OF YOUR CODE # \n",
" #########################################################################\n",
"\n",
" return y_pred"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a kNN classifier instance. \n",
"# Remember that training a kNN classifier is a noop: \n",
"# the Classifier simply remembers the data and does no further processing \n",
"classifier = KNearestNeighbor()\n",
"classifier.train(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: implement compute_distances_two_loops from the knn class definition above\n",
"\n",
"# Test your implementation:\n",
"dists = classifier.compute_distances_two_loops(X_test)\n",
"print(dists.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We can visualize the distance matrix: each row is a single test example and\n",
"# its distances to training examples\n",
"plt.imshow(dists, interpolation='none')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Inline Question #3:** Notice the structured patterns in the distance matrix, where some rows or columns are visibly brighter. (Note that with the default color scheme black indicates low distances while white indicates high distances.)\n",
"\n",
"- What in the data is the cause behind the distinctly bright rows?\n",
"- What causes the bright columns?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Your Answer**: *fill this in.*\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO : Now implement the function predict_labels from the KNN class above and run the code below:\n",
"# We use k = 1 (which is Nearest Neighbor).\n",
"y_test_pred = classifier.predict_labels(dists, k=1)\n",
"\n",
"# Compute and print the fraction of correctly predicted examples\n",
"num_correct = np.sum(y_test_pred == y_test)\n",
"accuracy = float(num_correct) / num_test\n",
"print('Got %d / %d correct => accuracy: %f' % (int(num_correct), num_test, accuracy))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should expect to see approximately `90%` accuracy. Now let's try out a larger `k`, say `k = 5`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_test_pred = classifier.predict_labels(dists, k=5)\n",
"num_correct = np.sum(y_test_pred == y_test)\n",
"accuracy = float(num_correct) / num_test\n",
"print('Got %d / %d correct => accuracy: %f' % (int(num_correct), num_test, accuracy))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should expect to see a slightly better performance than with `k = 1`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now let's speed up distance matrix computation by using partial vectorization\n",
"# with one loop. Implement the function compute_distances_one_loop and run the\n",
"# code below:\n",
"dists_one = classifier.compute_distances_one_loop(X_test)\n",
"\n",
"# To ensure that our vectorized implementation is correct, we make sure that it\n",
"# agrees with the naive implementation. There are many ways to decide whether\n",
"# two matrices are similar; one of the simplest is the Frobenius norm. In case\n",
"# you haven't seen it before, the Frobenius norm of the difference of two\n",
"# matrices is the square root of the sum of the squared differences of all\n",
"# elements; in other words, reshape the matrices into vectors and compute the\n",
"# Euclidean distance between them.\n",
"difference = np.linalg.norm(dists - dists_one, ord='fro')\n",
"print('Difference was: %f' % (difference, ))\n",
"if difference < 0.001:\n",
" print('Good! The distance matrices are the same')\n",
"else:\n",
" print('Uh-oh! The distance matrices are different')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now implement the fully vectorized version inside compute_distances_no_loops\n",
"# and run the code\n",
"dists_two = classifier.compute_distances_no_loops(X_test)\n",
"\n",
"# check that the distance matrix agrees with the one we computed before:\n",
"difference = np.linalg.norm(dists - dists_two, ord='fro')\n",
"print('Difference was: %f' % (difference, ))\n",
"if difference < 0.001:\n",
" print('Good! The distance matrices are the same')\n",
"else:\n",
" print('Uh-oh! The distance matrices are different')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's compare how fast the implementations are\n",
"def time_function(f, *args):\n",
" \"\"\"\n",
" Call a function f with args and return the time (in seconds) that it took to execute.\n",
" \"\"\"\n",
" import time\n",
" tic = time.time()\n",
" f(*args)\n",
" toc = time.time()\n",
" return toc - tic\n",
"\n",
"two_loop_time = time_function(classifier.compute_distances_two_loops, X_test)\n",
"print('Two loop version took %f seconds' % two_loop_time)\n",
"\n",
"one_loop_time = time_function(classifier.compute_distances_one_loop, X_test)\n",
"print('One loop version took %f seconds' % one_loop_time)\n",
"\n",
"no_loop_time = time_function(classifier.compute_distances_no_loops, X_test)\n",
"print('No loop version took %f seconds' % no_loop_time)\n",
"\n",
"# you should see significantly faster performance with the fully vectorized implementation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cross-validation\n",
"\n",
"We have implemented the k-Nearest Neighbor classifier but we set the value k = 5 arbitrarily. We will now determine the best value of this hyperparameter with cross-validation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_folds = 5\n",
"k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50]\n",
"\n",
"#X_train_folds = []\n",
"#y_train_folds = []\n",
"################################################################################\n",
"# TODO: #\n",
"# Split up the training data into folds. After splitting, X_train_folds and #\n",
"# y_train_folds should each be lists of length num_folds, where #\n",
"# y_train_folds[i] is the label vector for the points in X_train_folds[i]. #\n",
"# Hint: Look up the numpy array_split function. #\n",
"################################################################################\n",
"pass\n",
"\n",
"################################################################################\n",
"# END OF YOUR CODE #\n",
"################################################################################\n",
"\n",
"# A dictionary holding the accuracies for different values of k that we find\n",
"# when running cross-validation. After running cross-validation,\n",
"# k_to_accuracies[k] should be a list of length num_folds giving the different\n",
"# accuracy values that we found when using that value of k.\n",
"k_to_accuracies = {}\n",
"\n",
"\n",
"################################################################################\n",
"# TODO: #\n",
"# Perform k-fold cross validation to find the best value of k. For each #\n",
"# possible value of k, run the k-nearest-neighbor algorithm num_folds times, #\n",
"# where in each case you use all but one of the folds as training data and the #\n",
"# last fold as a validation set. Store the accuracies for all fold and all #\n",
"# values of k in the k_to_accuracies dictionary. #\n",
"################################################################################\n",
"pass\n",
"\n",
"################################################################################\n",
"# END OF YOUR CODE #\n",
"################################################################################\n",
"\n",
"# Print out the computed accuracies\n",
"for k in sorted(k_to_accuracies):\n",
" for accuracy in k_to_accuracies[k]:\n",
" print('k = %d, accuracy = %f' % (k, accuracy))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plot the raw observations\n",
"for k in k_choices:\n",
" accuracies = k_to_accuracies[k]\n",
" plt.scatter([k] * len(accuracies), accuracies)\n",
"\n",
"# plot the trend line with error bars that correspond to standard deviation\n",
"accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())])\n",
"accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())])\n",
"plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)\n",
"plt.title('Cross-validation on k')\n",
"plt.xlabel('k')\n",
"plt.ylabel('Cross-validation accuracy')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Based on the cross-validation results above, choose the best value for k, \n",
"# retrain the classifier using all the training data, and test it on the test\n",
"# data. You should be able to get above 90% accuracy on the test data.\n",
"best_k = 1 # TODO: put your best k value here\n",
"\n",
"classifier = KNearestNeighbor()\n",
"classifier.train(X_train, y_train)\n",
"y_test_pred = classifier.predict(X_test, k=best_k)\n",
"\n",
"# Compute and display the accuracy\n",
"num_correct = np.sum(y_test_pred == y_test)\n",
"accuracy = float(num_correct) / num_test\n",
"print('Got %d / %d correct => accuracy: %f' % (int(num_correct), num_test, accuracy))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,28 @@
Country,GDP per capita (USD),Life satisfaction
Russia,26456.3879381321,5.8
Greece,27287.0834009302,5.4
Turkey,28384.9877846263,5.5
Latvia,29932.4939100562,5.9
Hungary,31007.7684065437,5.6
Portugal,32181.1545372343,5.4
Poland,32238.157259275,6.1
Estonia,35638.4213511812,5.7
Spain,36215.4475907307,6.3
Slovenia,36547.7389559849,5.9
Lithuania,36732.034744031,5.9
Israel,38341.3075704083,7.2
Italy,38992.1483807498,6.0
United Kingdom,41627.129269425,6.8
France,42025.6173730617,6.5
New Zealand,42404.3937381567,7.3
Canada,45856.6256264804,7.4
Finland,47260.800458441,7.6
Belgium,48210.0331113444,6.9
Australia,48697.8370282475,7.3
Sweden,50683.3235097178,7.3
Germany,50922.3580234484,7.0
Austria,51935.6038618156,7.1
Iceland,52279.7288513646,7.5
Netherlands,54209.5638357302,7.4
Denmark,55938.2128086032,7.6
United States,60235.7284916969,6.9
1 Country GDP per capita (USD) Life satisfaction
2 Russia 26456.3879381321 5.8
3 Greece 27287.0834009302 5.4
4 Turkey 28384.9877846263 5.5
5 Latvia 29932.4939100562 5.9
6 Hungary 31007.7684065437 5.6
7 Portugal 32181.1545372343 5.4
8 Poland 32238.157259275 6.1
9 Estonia 35638.4213511812 5.7
10 Spain 36215.4475907307 6.3
11 Slovenia 36547.7389559849 5.9
12 Lithuania 36732.034744031 5.9
13 Israel 38341.3075704083 7.2
14 Italy 38992.1483807498 6.0
15 United Kingdom 41627.129269425 6.8
16 France 42025.6173730617 6.5
17 New Zealand 42404.3937381567 7.3
18 Canada 45856.6256264804 7.4
19 Finland 47260.800458441 7.6
20 Belgium 48210.0331113444 6.9
21 Australia 48697.8370282475 7.3
22 Sweden 50683.3235097178 7.3
23 Germany 50922.3580234484 7.0
24 Austria 51935.6038618156 7.1
25 Iceland 52279.7288513646 7.5
26 Netherlands 54209.5638357302 7.4
27 Denmark 55938.2128086032 7.6
28 United States 60235.7284916969 6.9

View File

@@ -0,0 +1,258 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b94b0451",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"# Download and prepare the data\n",
"lifesat = pd.read_csv(\"lifesat.csv\")\n",
"X = lifesat[[\"GDP per capita (USD)\"]].values\n",
"y = lifesat[[\"Life satisfaction\"]].values\n",
"\n",
"# Visualize the data\n",
"lifesat.plot(kind='scatter', grid=True,\n",
" x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
"plt.axis([23_500, 62_500, 4, 9])\n",
"plt.show()\n",
"\n",
"# Select a linear model\n",
"model = LinearRegression()\n",
"\n",
"# Train the model\n",
"model.fit(X, y)\n",
"\n",
"# Make a prediction for Cyprus\n",
"X_new = [[37_655.2]] # Cyprus' GDP per capita in 2020\n",
"print(model.predict(X_new)) # outputs [[6.30165767]]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94fda07f",
"metadata": {},
"outputs": [],
"source": [
"X_test = np.linspace(25000, 60000, 200)\n",
"X_test = [[value] for value in X_test]\n",
"y_test = model.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "838b0242",
"metadata": {},
"outputs": [],
"source": [
"# Visualize the data\n",
"lifesat.plot(kind='scatter', grid=True,\n",
" x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
"plt.axis([23_500, 62_500, 4, 9])\n",
"plt.plot(X_test, y_test, color='red')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa14a4ca",
"metadata": {},
"outputs": [],
"source": [
"class KNearestNeighborRegressor(object):\n",
" \"\"\" a kNN regressor with L2 distance \"\"\"\n",
"\n",
" def __init__(self):\n",
" pass\n",
"\n",
" def train(self, X, y):\n",
" \"\"\"\n",
" Train the regressor. For k-nearest neighbors this is just \n",
" memorizing the training data.\n",
"\n",
" Inputs:\n",
" - X: A numpy array of shape (num_train, D) containing the training data\n",
" consisting of num_train samples each of dimension D.\n",
" - y: A numpy array of shape (num_train,) containing the training target\n",
" values, where y[i] is the target value for X[i].\n",
" \"\"\"\n",
" self.X_train = X\n",
" self.y_train = y\n",
" \n",
" def predict(self, X, k=1):\n",
" \"\"\"\n",
" Predict target values for test data using this regressor.\n",
"\n",
" Inputs:\n",
" - X: A numpy array of shape (num_test, D) containing test data consisting\n",
" of num_test samples each of dimension D.\n",
" - k: The number of nearest neighbors whose target values are averaged to\n",
" form each prediction.\n",
"\n",
" Returns:\n",
" - y: A numpy array of shape (num_test,) containing predicted values for the\n",
" test data, where y[i] is the predicted value for the test point X[i]. \n",
" \"\"\"\n",
" dists = self.compute_distances(X)\n",
" \n",
" return self.predict_values(dists, k=k)\n",
"\n",
"\n",
" def compute_distances(self, X):\n",
" \"\"\"\n",
" Compute the distance between each test point in X and each training point\n",
" in self.X_train using a single loop over the test data.\n",
"\n",
" Inputs:\n",
" - X: A numpy array of shape (num_test, D) containing test data.\n",
"\n",
" Returns:\n",
" - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
" is the Euclidean distance between the ith test point and the jth training\n",
" point.\n",
" \"\"\"\n",
" num_test = X.shape[0]\n",
" num_train = self.X_train.shape[0]\n",
" dists = np.zeros((num_test, num_train))\n",
" for i in range(num_test):\n",
" #######################################################################\n",
" # TODO: #\n",
" # Compute the l2 distance between the ith test point and all training #\n",
" # points, and store the result in dists[i, :]. #\n",
" #######################################################################\n",
" \n",
" pass\n",
" \n",
" #######################################################################\n",
" # END OF YOUR CODE #\n",
" #######################################################################\n",
" return dists\n",
"\n",
"\n",
"\n",
" def predict_values(self, dists, k=1):\n",
" \"\"\"\n",
" Given a matrix of distances between test points and training points,\n",
" predict a value for each test point.\n",
"\n",
" Inputs:\n",
" - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n",
" gives the distance betwen the ith test point and the jth training point.\n",
"\n",
" Returns:\n",
" - y: A numpy array of shape (num_test,) containing predicted values for the\n",
" test data, where y[i] is the predicted value for the test point X[i]. \n",
" \"\"\"\n",
" num_test = dists.shape[0]\n",
" y_pred = np.zeros(num_test)\n",
" for i in range(num_test):\n",
" # A list of length k storing the labels of the k nearest neighbors to\n",
" # the ith test point.\n",
" closest_y = []\n",
" \n",
" #########################################################################\n",
" # TODO: #\n",
" # Use the distance matrix to find the k nearest neighbors of the ith #\n",
" # testing point, and use self.y_train to find the labels of these #\n",
" # neighbors. Store these labels in closest_y. #\n",
" # Hint: Look up the function numpy.argsort. #\n",
" #########################################################################\n",
" \n",
" pass\n",
" \n",
" #########################################################################\n",
" # TODO: #\n",
" # Now that you have found the labels of the k nearest neighbors, you #\n",
" # need to compute the average of the target values corresponding to the #\n",
" # nearest neighbors. #\n",
" #########################################################################\n",
" \n",
" pass\n",
" \n",
" #########################################################################\n",
" # END OF YOUR CODE # \n",
" #########################################################################\n",
"\n",
" return y_pred"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "267d1168",
"metadata": {},
"outputs": [],
"source": [
"knn_reg = KNearestNeighborRegressor()\n",
"knn_reg.train(np.array(X), y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd8203ba",
"metadata": {},
"outputs": [],
"source": [
"y_hat_1 = knn_reg.predict(np.array(X_test), k=1)\n",
"y_hat_3 = knn_reg.predict(np.array(X_test), k=3)\n",
"y_hat_5 = knn_reg.predict(np.array(X_test), k=5)\n",
"y_hat_7 = knn_reg.predict(np.array(X_test), k=7)\n",
"y_hat_20 = knn_reg.predict(np.array(X_test), k=20)\n",
"y_hat_27 = knn_reg.predict(np.array(X_test), k=27)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3704256",
"metadata": {},
"outputs": [],
"source": [
"# Visualize the data\n",
"lifesat.plot(kind='scatter', grid=True,\n",
" x=\"GDP per capita (USD)\", y=\"Life satisfaction\")\n",
"plt.axis([23_500, 62_500, 4, 9])\n",
"plt.plot(X_test, y_test, color='red')\n",
"plt.plot(X_test, y_hat_1, color='green')\n",
"# plt.plot(X_test, y_hat_3, color='blue')\n",
"# plt.plot(X_test, y_hat_5, color='magenta')\n",
"# plt.plot(X_test, y_hat_7, color='orange')\n",
"# plt.plot(X_test, y_hat_20, color='black')\n",
"# plt.plot(X_test, y_hat_27, color='grey')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}