机器学习项目实战之贷款申请最大利润
来源:互联网 发布:hanlp分词算法 编辑:程序博客网 时间:2024/06/10 03:17
import pandas as pd

# Load the raw loan export. skiprows=1 discards the first physical line so
# the header is read from the second line (presumably the file starts with
# a banner line -- confirm against the CSV).
loans_2007 = pd.read_csv("LoanStats3a.csv", skiprows=1)

# Data cleaning: filter out useless features.
# Keep only columns that have at least half of their values present.
# Floor division keeps the threshold an integer on both Python 2 and 3.
half_count = len(loans_2007) // 2
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)

# Backslashes are escaped so that the "\t" in "\test" is not interpreted as
# a tab character; the path now matches the one the next step reads from.
loans_2007.to_csv("D:\\test\\machineLearning\\loans_2007.csv", index=False)
import pandas as pd

# Reload the trimmed data set written by the previous step.
loans_2007 = pd.read_csv("D:\\test\\machineLearning\\loans_2007.csv")

# drop_duplicates() returns a new frame instead of modifying in place, so
# the result has to be assigned back for the de-duplication to take effect.
loans_2007 = loans_2007.drop_duplicates()

# Show the first record and the number of columns.
print(loans_2007.iloc[0])
print(loans_2007.shape[1])
id 1077501member_id 1.2966e+06loan_amnt 5000funded_amnt 5000funded_amnt_inv 4975term 36 monthsint_rate 10.65%installment 162.87grade Bsub_grade B2emp_title NaNemp_length 10+ yearshome_ownership RENTannual_inc 24000verification_status Verifiedissue_d Dec-2011loan_status Fully Paidpymnt_plan npurpose credit_cardtitle Computerzip_code 860xxaddr_state AZdti 27.65delinq_2yrs 0earliest_cr_line Jan-1985inq_last_6mths 1open_acc 3pub_rec 0revol_bal 13648revol_util 83.7%total_acc 9initial_list_status fout_prncp 0out_prncp_inv 0total_pymnt 5863.16total_pymnt_inv 5833.84total_rec_prncp 5000total_rec_int 863.16total_rec_late_fee 0recoveries 0collection_recovery_fee 0last_pymnt_d Jan-2015last_pymnt_amnt 171.62last_credit_pull_d Nov-2016collections_12_mths_ex_med 0policy_code 1application_type INDIVIDUALacc_now_delinq 0chargeoff_within_12_mths 0delinq_amnt 0pub_rec_bankruptcies 0tax_liens 0Name: 0, dtype: object52
# Data preprocessing: remove the columns that are not used as features.
# NOTE(review): presumably these are identifiers, redundant fields, and
# values only known after the loan was issued -- confirm the rationale.
unused_columns = [
    'id', 'member_id', 'funded_amnt', 'funded_amnt_inv', 'grade',
    'sub_grade', 'emp_title', 'last_pymnt_d', 'last_pymnt_amnt',
    'zip_code', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp',
    'total_rec_int', 'total_rec_late_fee', 'recoveries',
    'collection_recovery_fee', 'issue_d',
]
loans_2007 = loans_2007.drop(unused_columns, axis=1)

# Show the first record and the remaining column count.
print(loans_2007.iloc[0])
print(loans_2007.shape[1])
loan_amnt 5000term 36 monthsint_rate 10.65%installment 162.87emp_length 10+ yearshome_ownership RENTannual_inc 24000verification_status Verifiedloan_status Fully Paidpymnt_plan npurpose credit_cardtitle Computeraddr_state AZdti 27.65delinq_2yrs 0earliest_cr_line Jan-1985inq_last_6mths 1open_acc 3pub_rec 0revol_bal 13648revol_util 83.7%total_acc 9initial_list_status flast_credit_pull_d Nov-2016collections_12_mths_ex_med 0policy_code 1application_type INDIVIDUALacc_now_delinq 0chargeoff_within_12_mths 0delinq_amnt 0pub_rec_bankruptcies 0tax_liens 0Name: 0, dtype: object32
# loan_status is the current status of the loan; list how often each
# status occurs.
status_counts = loans_2007["loan_status"].value_counts()
print(status_counts)
Fully Paid                                             33902
Charged Off                                             5658
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Current                                                  201
Late (31-120 days)                                        10
In Grace Period                                            9
Late (16-30 days)                                          5
Default                                                    1
Name: loan_status, dtype: int64
# Binary classification: keep only the rows whose status is 'Fully Paid'
# or 'Charged Off'.
# The original note glosses 'Fully Paid' as a granted loan and
# 'Charged Off' as a refused one.
# NOTE(review): 'Charged Off' presumably denotes a loan written off as a
# loss rather than a rejected application -- confirm with the data
# dictionary.
is_labelled = loans_2007['loan_status'].isin(['Fully Paid', 'Charged Off'])
loans_2007 = loans_2007[is_labelled]

# Convert the status strings to numbers. The replacement is expressed as a
# dictionary whose outer key is the column to apply it to.
status_replace = {
    'loan_status': {
        'Fully Paid': 1,
        'Charged Off': 0,
    }
}
loans_2007 = loans_2007.replace(status_replace)
# Remove columns that hold a single distinct value once NaN is ignored:
# a constant column cannot help separate the two classes.
orig_columns = loans_2007.columns
drop_columns = [
    col for col in orig_columns
    if len(loans_2007[col].dropna().unique()) == 1
]
loans_2007 = loans_2007.drop(drop_columns, axis=1)

# print() with a single argument works on both Python 2 and Python 3 and
# matches the call style used earlier in this script.
print(drop_columns)
print(loans_2007.shape)
['initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens']
(39560, 24)
# Check for missing values: count the nulls in every column.
null_count = loans_2007.isnull().sum()
# Single-argument print() runs on both Python 2 and Python 3.
print(null_count)
loan_amnt                 0
term                      0
int_rate                  0
installment               0
emp_length                0
home_ownership            0
annual_inc                0
verification_status       0
loan_status               0
pymnt_plan                0
purpose                   0
title                    10
addr_state                0
dti                       0
delinq_2yrs               0
earliest_cr_line          0
inq_last_6mths            0
open_acc                  0
pub_rec                   0
revol_bal                 0
revol_util               50
total_acc                 0
last_credit_pull_d        2
pub_rec_bankruptcies    697
dtype: int64
# Drop the pub_rec_bankruptcies column entirely, then drop every row that
# still contains a missing value.
loans_2007 = loans_2007.drop("pub_rec_bankruptcies", axis=1)
loans_2007 = loans_2007.dropna(axis=0)

# Summarise how many columns there are of each dtype.
# Single-argument print() runs on both Python 2 and Python 3.
print(loans_2007.dtypes.value_counts())
object     12
float64    10
int64       1
dtype: int64
# Following from the dtype summary: sklearn only accepts numeric data, not
# strings, so the columns reported as object have to be converted to
# numeric values. Show the first row of those columns to see what they hold.
object_columns_df = loans_2007.select_dtypes(include=["object"])
# Single-argument print() runs on both Python 2 and Python 3.
print(object_columns_df.iloc[0])
term                     36 months
int_rate                    10.65%
emp_length               10+ years
home_ownership                RENT
verification_status       Verified
pymnt_plan                       n
purpose                credit_card
title                     Computer
addr_state                      AZ
earliest_cr_line          Jan-1985
revol_util                   83.7%
last_credit_pull_d        Nov-2016
Name: 0, dtype: object
# Inspect the distinct values of the candidate categorical columns.
cols = ['home_ownership', 'verification_status', 'emp_length', 'term',
        'addr_state']
for c in cols:
    # Single-argument print() runs on both Python 2 and Python 3.
    print(loans_2007[c].value_counts())
RENT 18780MORTGAGE 17574OWN 3045OTHER 96NONE 3Name: home_ownership, dtype: int64Not Verified 16856Verified 12705Source Verified 9937Name: verification_status, dtype: int6410+ years 8821< 1 year 45632 years 43713 years 40744 years 34095 years 32701 year 32276 years 22127 years 17568 years 14729 years 1254n/a 1069Name: emp_length, dtype: int64 36 months 29041 60 months 10457Name: term, dtype: int64CA 7070NY 3788FL 2856TX 2714NJ 1838IL 1517PA 1504VA 1400GA 1393MA 1336OH 1208MD 1049AZ 874WA 834CO 786NC 780CT 747MI 722MO 682MN 611NV 492SC 470WI 453AL 446OR 445LA 435KY 325OK 298KS 269UT 256AR 243DC 211RI 198NM 188WV 176HI 172NH 172DE 113MT 84WY 83AK 79SD 63VT 54MS 19TN 17IN 9ID 6IA 5NE 5ME 3Name: addr_state, dtype: int64
# Replacement table for emp_length: "<n> years" maps to n, "10+ years" to
# 10, "1 year" to 1, and both "< 1 year" and "n/a" to 0.
emp_length_codes = {"%d years" % years: years for years in range(2, 10)}
emp_length_codes.update({
    "10+ years": 10,
    "1 year": 1,
    "< 1 year": 0,
    "n/a": 0,
})
mapping_dict = {"emp_length": emp_length_codes}

# Remove four more text columns that are not used as features.
loans_2007 = loans_2007.drop(
    ["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"],
    axis=1,
)

# Strip the trailing % and convert the percentage columns to floats.
for pct_col in ("int_rate", "revol_util"):
    loans_2007[pct_col] = loans_2007[pct_col].str.rstrip("%").astype("float")

# Apply the emp_length replacement table.
loans_2007 = loans_2007.replace(mapping_dict)
# One-hot encode the remaining text columns.
# emp_length is deliberately NOT listed here: it has already been mapped to
# numbers, so get_dummies would pass it through unchanged, concat would then
# create a second emp_length column, and dropping cat_columns would remove
# both copies -- silently discarding the feature.
cat_columns = ['home_ownership', 'verification_status', 'purpose', 'term']
dummy_df = pd.get_dummies(loans_2007[cat_columns])

# Append the indicator columns, then remove the text columns they replace.
loans_2007 = pd.concat([loans_2007, dummy_df], axis=1)
loans_2007 = loans_2007.drop(cat_columns, axis=1)
# For a lending project, high accuracy by itself means little: a single bad
# loan can cause a large loss, so ROC-style metrics have to be considered.
# Everything above was the preprocessing workflow; from here on the already
# cleaned data set is loaded from disk.
loans = pd.read_csv("D:\\test\\machineLearning\\cleaned_loans2007.csv")

# DataFrame.info() writes its report itself and returns None, so it is
# called directly; wrapping it in print only appended a stray "None".
loans.info()
<class 'pandas.core.frame.DataFrame'>RangeIndex: 39498 entries, 0 to 39497Data columns (total 37 columns):loan_amnt 39498 non-null float64int_rate 39498 non-null float64installment 39498 non-null float64annual_inc 39498 non-null float64loan_status 39498 non-null int64dti 39498 non-null float64delinq_2yrs 39498 non-null float64inq_last_6mths 39498 non-null float64open_acc 39498 non-null float64pub_rec 39498 non-null float64revol_bal 39498 non-null float64revol_util 39498 non-null float64total_acc 39498 non-null float64home_ownership_MORTGAGE 39498 non-null int64home_ownership_NONE 39498 non-null int64home_ownership_OTHER 39498 non-null int64home_ownership_OWN 39498 non-null int64home_ownership_RENT 39498 non-null int64verification_status_Not Verified 39498 non-null int64verification_status_Source Verified 39498 non-null int64verification_status_Verified 39498 non-null int64purpose_car 39498 non-null int64purpose_credit_card 39498 non-null int64purpose_debt_consolidation 39498 non-null int64purpose_educational 39498 non-null int64purpose_home_improvement 39498 non-null int64purpose_house 39498 non-null int64purpose_major_purchase 39498 non-null int64purpose_medical 39498 non-null int64purpose_moving 39498 non-null int64purpose_other 39498 non-null int64purpose_renewable_energy 39498 non-null int64purpose_small_business 39498 non-null int64purpose_vacation 39498 non-null int64purpose_wedding 39498 non-null int64term_ 36 months 39498 non-null int64term_ 60 months 39498 non-null int64dtypes: float64(12), int64(25)memory usage: 11.1 MBNone
# Analyse the data with logistic regression, a classic binary classifier.
# NOTE(review): sklearn.cross_validation is the legacy home of these
# helpers; newer scikit-learn releases presumably expose them only from
# sklearn.model_selection with a different KFold signature -- confirm the
# installed version before changing this import, since the later steps call
# KFold with the same legacy arguments.
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import KFold

lr = LogisticRegression()

# Every column except the label is a feature.
cols = loans.columns
train_cols = cols.drop("loan_status")
features = loans[train_cols]
target = loans["loan_status"]

# The fold generator is sized by the number of rows.
# NOTE(review): random_state presumably has no effect unless shuffling is
# enabled -- confirm.
n_rows = features.shape[0]
kf = KFold(n_rows, random_state=1)

# Out-of-fold predictions for every row, wrapped in a Series so they can be
# compared element-wise with the label column.
predictions = pd.Series(cross_val_predict(lr, features, target, cv=kf))
# Confusion-matrix counts of the predictions against the true labels.
# False positives: predicted 1, actually 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives: predicted 1, actually 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted 0, actually 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives: predicted 0, actually 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates: the true positive rate is very high, which is good because those
# are the loans that earn money, but the false positive rate is very high
# too, so the chance of losing money is also very high.
# The analysis is that almost every applicant gets a loan because the
# samples are imbalanced.
# Remedy 1: data augmentation -- add more examples of the under-represented
# class, either synthesised or collected.
tpr = tp / float(tp + fn)
# The false positive rate is FP over all actual negatives (FP + TN); the
# denominator previously used FN, which does not measure that rate.
fpr = fp / float(fp + tn)
print(tpr)
print(fpr)
print(predictions[:20])
0.9991435068960.9948727015560 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115 116 117 118 119 1dtype: int64
# Pass a parameter that re-weights the positive and negative samples.
lr = LogisticRegression(class_weight='balanced')
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts of the predictions against the true labels.
# False positives: predicted 1, actually 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives: predicted 1, actually 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted 0, actually 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives: predicted 0, actually 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

tpr = tp / float(tp + fn)
# The false positive rate is FP over all actual negatives (FP + TN); the
# denominator previously used FN, which does not measure that rate.
fpr = fp / float(fp + tn)

# With the class weights adjusted the trained model is more meaningful, but
# the tpr is not high enough and the fpr is not low enough.
print(tpr)
print(fpr)
print(predictions[:20])
0.6703682920350.16746233030 11 02 03 04 15 06 07 08 09 010 111 012 113 114 015 016 117 118 119 0dtype: int64
# The previous run used the library's built-in weighting; the weights can
# also be specified by hand. Here class 0 is weighted 5 and class 1 is
# weighted 1.
penalty = {
    0: 5,
    1: 1,
}
lr = LogisticRegression(class_weight=penalty)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts of the predictions against the true labels.
# False positives: predicted 1, actually 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives: predicted 1, actually 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted 0, actually 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives: predicted 0, actually 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

tpr = tp / float(tp + fn)
# The false positive rate is FP over all actual negatives (FP + TN); the
# denominator previously used FN, which does not measure that rate.
fpr = fp / float(fp + tn)

# The weights turn out to have a large effect on the final result; in
# practice they are tuned to the requirements at hand.
print(tpr)
print(fpr)
print(predictions[:20])
0.7186863167840.2156620553360 11 02 03 14 15 06 07 08 09 010 111 012 113 114 015 016 117 118 119 0dtype: int64
# Analyse the data with a random forest, using the library's built-in
# "balanced" class weighting. (The variable keeps the name lr even though
# it now holds a random forest.)
from sklearn.ensemble import RandomForestClassifier

lr = RandomForestClassifier(class_weight="balanced", random_state=1)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts of the predictions against the true labels.
# False positives: predicted 1, actually 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives: predicted 1, actually 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted 0, actually 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives: predicted 0, actually 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

tpr = tp / float(tp + fn)
# The false positive rate is FP over all actual negatives (FP + TN); the
# denominator previously used FN, which does not measure that rate.
fpr = fp / float(fp + tn)

# The random forest does not perform very well either.
print(tpr)
print(fpr)
print(predictions[:20])
0.9738621932130.8570505572610 11 12 13 14 15 06 17 18 19 110 111 112 113 114 115 116 117 118 119 0dtype: int64
# Analyse the data with a random forest, setting the number of trees to 10.
# NOTE(review): 10 was presumably already the default n_estimators in the
# scikit-learn version used here, which would explain results identical to
# the run without this argument -- confirm, and try a larger value.
from sklearn.ensemble import RandomForestClassifier

lr = RandomForestClassifier(n_estimators=10, class_weight="balanced",
                            random_state=1)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts of the predictions against the true labels.
# False positives: predicted 1, actually 0.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives: predicted 1, actually 1.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted 0, actually 1.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negatives: predicted 0, actually 0.
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

tpr = tp / float(tp + fn)
# The false positive rate is FP over all actual negatives (FP + TN); the
# denominator previously used FN, which does not measure that rate.
fpr = fp / float(fp + tn)

# The result is still not very good.
print(tpr)
print(fpr)
print(predictions[:20])
0.9738621932130.8570505572610 11 12 13 14 15 06 17 18 19 110 111 112 113 114 115 116 117 118 119 0dtype: int64
#实际中:换算法模型,去掉一些特征,生成一些新的特征,调模型的参数,比如权重等来实现更好的效果
0 0
- 机器学习项目实战之贷款申请最大利润
- 机器学习项目实战之贝叶斯垃圾邮件分类
- 机器学习项目实战之用户流失预警
- 机器学习实战之决策树
- 机器学习实战之KMeans
- 机器学习实战之PCA
- 机器学习实战之决策树
- 机器学习实战之决策树
- 机器学习实战之kNN
- 机器学习实战之--regression
- 机器学习实战之决策树
- 机器学习实战之adaboost
- 机器学习实战之回归
- 机器学习实战之PCA
- 机器学习实战之SVD
- 机器学习实战之Apriori
- 《机器学习实战》之决策树
- 机器学习实战之CART
- 如何让webBrowser中的超链接不会在新窗口中打开?
- Charm Bracelet
- [转载]用latex写毕业论文
- 在Java7中Switch用String做参数
- 迭代加深搜索初步
- 机器学习项目实战之贷款申请最大利润
- ZenCoding 快速指南
- Java编程思想-第一章 1.5-1.8读书笔记
- 基于动态阈值检测丝网破损
- go tool vet是你的好朋友
- 揭开CSS的绝对定位真实的面纱(一)
- 关于Vim中NERDTree插件的安装与使用
- QT环境搭建: QT玩转在linux的x86平台或者ARM平台上
- java实现单链表