机器学习项目实战之贷款申请最大利润

来源：互联网发布：hanlp分词算法编辑：程序博客网时间：2024/06/10 03:17

import pandas as pd

# Load the raw export, skipping the first line of the file.
loans_2007 = pd.read_csv("LoanStats3a.csv", skiprows=1)

# Data cleaning: filter out useless features.
# Keep only columns that have at least half_count non-missing values.
# Floor division keeps the threshold an integer, which dropna(thresh=...) expects.
half_count = len(loans_2007) // 2
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)

# Backslashes are escaped so that "\t" is not read as a tab character; this is
# the same path the next step reads the file back from.
loans_2007.to_csv("D:\\test\\machineLearning\\loans_2007.csv", index=False)

import pandas as pd

# Reload the column-filtered data written by the previous step.
loans_2007 = pd.read_csv("D:\\test\\machineLearning\\loans_2007.csv")

# drop_duplicates() returns a new DataFrame rather than modifying in place,
# so the result must be assigned back for the de-duplication to take effect.
loans_2007 = loans_2007.drop_duplicates()

# Show the first record and the number of remaining columns.
print(loans_2007.iloc[0])
print(loans_2007.shape[1])

id                                1077501member_id                      1.2966e+06loan_amnt                            5000funded_amnt                          5000funded_amnt_inv                      4975term                            36 monthsint_rate                           10.65%installment                        162.87grade                                   Bsub_grade                              B2emp_title                             NaNemp_length                      10+ yearshome_ownership                       RENTannual_inc                          24000verification_status              Verifiedissue_d                          Dec-2011loan_status                    Fully Paidpymnt_plan                              npurpose                       credit_cardtitle                            Computerzip_code                            860xxaddr_state                             AZdti                                 27.65delinq_2yrs                             0earliest_cr_line                 Jan-1985inq_last_6mths                          1open_acc                                3pub_rec                                 0revol_bal                           13648revol_util                          83.7%total_acc                               9initial_list_status                     fout_prncp                               0out_prncp_inv                           0total_pymnt                       5863.16total_pymnt_inv                   5833.84total_rec_prncp                      5000total_rec_int                      863.16total_rec_late_fee                      0recoveries                              0collection_recovery_fee                 0last_pymnt_d                     Jan-2015last_pymnt_amnt                    171.62last_credit_pull_d               Nov-2016collections_12_mths_ex_med              0policy_code                             1application_type               INDIVIDUALacc_now_delinq                          0chargeoff_within_12_mths        
        0delinq_amnt                             0pub_rec_bankruptcies                    0tax_liens                               0Name: 0, dtype: object52

# Data preprocessing: remove columns that are not used as model features.
unused_columns = [
    'id', 'member_id', 'funded_amnt', 'funded_amnt_inv', 'grade',
    'sub_grade', 'emp_title', 'last_pymnt_d', 'last_pymnt_amnt',
    'zip_code', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp',
    'total_rec_int', 'total_rec_late_fee', 'recoveries',
    'collection_recovery_fee', 'issue_d',
]
loans_2007 = loans_2007.drop(unused_columns, axis=1)

# Show the first record and the number of remaining columns.
first_row = loans_2007.iloc[0]
print(first_row)
print(loans_2007.shape[1])

loan_amnt                            5000term                            36 monthsint_rate                           10.65%installment                        162.87emp_length                      10+ yearshome_ownership                       RENTannual_inc                          24000verification_status              Verifiedloan_status                    Fully Paidpymnt_plan                              npurpose                       credit_cardtitle                            Computeraddr_state                             AZdti                                 27.65delinq_2yrs                             0earliest_cr_line                 Jan-1985inq_last_6mths                          1open_acc                                3pub_rec                                 0revol_bal                           13648revol_util                          83.7%total_acc                               9initial_list_status                     flast_credit_pull_d               Nov-2016collections_12_mths_ex_med              0policy_code                             1application_type               INDIVIDUALacc_now_delinq                          0chargeoff_within_12_mths                0delinq_amnt                             0pub_rec_bankruptcies                    0tax_liens                               0Name: 0, dtype: object32

# loan_status is the current status of each loan; count the rows per status.
status_counts = loans_2007["loan_status"].value_counts()
print(status_counts)

Fully Paid                                             33902Charged Off                                             5658Does not meet the credit policy. Status:Fully Paid      1988Does not meet the credit policy. Status:Charged Off      761Current                                                  201Late (31-120 days)                                        10In Grace Period                                            9Late (16-30 days)                                          5Default                                                    1Name: loan_status, dtype: int64

# Binary classification: keep only the 'Fully Paid' and 'Charged Off' rows.
# NOTE(review): the original comment described these as "loan issued" and
# "loan rejected"; in LendingClub data they appear to be final repayment
# outcomes (repaid in full vs. written off) -- confirm with the data dictionary.
kept_statuses = ['Fully Paid', 'Charged Off']
loans_2007 = loans_2007[loans_2007['loan_status'].isin(kept_statuses)]

# Convert the status strings to numbers. replace() takes a nested dict:
# the outer key is the column name, the inner dict maps old value -> new value.
status_replace = {
    'loan_status': {
        'Fully Paid': 1,
        'Charged Off': 0,
    }
}
loans_2007 = loans_2007.replace(status_replace)

# A column that holds only one distinct value (ignoring missing entries)
# carries no information, so collect such columns and drop them.
orig_columns = loans_2007.columns
drop_columns = [
    col for col in orig_columns
    if len(loans_2007[col].dropna().unique()) == 1
]
loans_2007 = loans_2007.drop(drop_columns, axis=1)

# Parenthesised print works on both Python 2 and 3 and matches the earlier cells.
print(drop_columns)
print(loans_2007.shape)

['initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens'](39560, 24)

# Check for missing values: number of nulls in each column.
null_count = loans_2007.isnull().sum()
# Parenthesised print works on both Python 2 and 3 and matches the earlier cells.
print(null_count)

loan_amnt                 0term                      0int_rate                  0installment               0emp_length                0home_ownership            0annual_inc                0verification_status       0loan_status               0pymnt_plan                0purpose                   0title                    10addr_state                0dti                       0delinq_2yrs               0earliest_cr_line          0inq_last_6mths            0open_acc                  0pub_rec                   0revol_bal                 0revol_util               50total_acc                 0last_credit_pull_d        2pub_rec_bankruptcies    697dtype: int64

# pub_rec_bankruptcies has by far the most missing entries in the count above,
# so the whole column is removed.
loans_2007 = loans_2007.drop("pub_rec_bankruptcies", axis=1)

# Remove the rows that still contain missing values.
loans_2007 = loans_2007.dropna(axis=0)

# Count how many columns there are of each dtype.
# Parenthesised print works on both Python 2 and 3 and matches the earlier cells.
print(loans_2007.dtypes.value_counts())

object     12float64    10int64       1dtype: int64

# sklearn only accepts numeric data, not strings. The columns reported as
# "object" above hold strings and must be converted to numeric values, so
# look at one sample row of those columns first.
object_columns_df = loans_2007.select_dtypes(include=["object"])
# Parenthesised print works on both Python 2 and 3 and matches the earlier cells.
print(object_columns_df.iloc[0])

term                     36 monthsint_rate                    10.65%emp_length               10+ yearshome_ownership                RENTverification_status       Verifiedpymnt_plan                       npurpose                credit_cardtitle                     Computeraddr_state                      AZearliest_cr_line          Jan-1985revol_util                   83.7%last_credit_pull_d        Nov-2016Name: 0, dtype: object

# Inspect the distinct values (and their frequencies) of selected string columns.
cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']
for c in cols:
    # Parenthesised print works on both Python 2 and 3 and matches the earlier cells.
    print(loans_2007[c].value_counts())

RENT        18780MORTGAGE    17574OWN          3045OTHER          96NONE            3Name: home_ownership, dtype: int64Not Verified       16856Verified           12705Source Verified     9937Name: verification_status, dtype: int6410+ years    8821< 1 year     45632 years      43713 years      40744 years      34095 years      32701 year       32276 years      22127 years      17568 years      14729 years      1254n/a          1069Name: emp_length, dtype: int64 36 months    29041 60 months    10457Name: term, dtype: int64CA    7070NY    3788FL    2856TX    2714NJ    1838IL    1517PA    1504VA    1400GA    1393MA    1336OH    1208MD    1049AZ     874WA     834CO     786NC     780CT     747MI     722MO     682MN     611NV     492SC     470WI     453AL     446OR     445LA     435KY     325OK     298KS     269UT     256AR     243DC     211RI     198NM     188WV     176HI     172NH     172DE     113MT      84WY      83AK      79SD      63VT      54MS      19TN      17IN       9ID       6IA       5NE       5ME       3Name: addr_state, dtype: int64

# Map the employment-length labels to integers: "2 years".."9 years" map to
# 2..9, "10+ years" to 10, "1 year" to 1, and both "< 1 year" and "n/a" to 0.
emp_length_codes = {"%d years" % years: years for years in range(2, 10)}
emp_length_codes.update({
    "10+ years": 10,
    "1 year": 1,
    "< 1 year": 0,
    "n/a": 0,
})
mapping_dict = {"emp_length": emp_length_codes}

# Drop the date, state and free-text title columns.
loans_2007 = loans_2007.drop(
    ["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"],
    axis=1,
)

# Strip the trailing "%" and convert the percentage columns to float.
for pct_col in ("int_rate", "revol_util"):
    loans_2007[pct_col] = loans_2007[pct_col].str.rstrip("%").astype("float")

# Apply the emp_length mapping (nested dict: column -> {old value: new value}).
loans_2007 = loans_2007.replace(mapping_dict)

# One-hot encode the categorical columns and swap them in for the originals.
# NOTE(review): emp_length is mapped to integers in the preceding step. If it is
# numeric at this point, get_dummies leaves it unencoded and the drop below
# removes it entirely -- confirm whether emp_length is meant to be kept.
cat_columns = [
    'home_ownership',
    'verification_status',
    'emp_length',
    'purpose',
    'term',
]
dummy_df = pd.get_dummies(loans_2007[cat_columns])

# Append the encoded columns, then remove every column named in cat_columns.
with_dummies = pd.concat([loans_2007, dummy_df], axis=1)
loans_2007 = with_dummies.drop(cat_columns, axis=1)

# For a lending project, high accuracy alone means little: a single bad loan
# can wipe out a lot of profit, so ROC-style metrics should be considered.
# The steps above are the preprocessing flow; load the already-cleaned data.
loans = pd.read_csv("D:\\test\\machineLearning\\cleaned_loans2007.csv")

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print would emit an extra "None" line.
loans.info()

<class 'pandas.core.frame.DataFrame'>RangeIndex: 39498 entries, 0 to 39497Data columns (total 37 columns):loan_amnt                              39498 non-null float64int_rate                               39498 non-null float64installment                            39498 non-null float64annual_inc                             39498 non-null float64loan_status                            39498 non-null int64dti                                    39498 non-null float64delinq_2yrs                            39498 non-null float64inq_last_6mths                         39498 non-null float64open_acc                               39498 non-null float64pub_rec                                39498 non-null float64revol_bal                              39498 non-null float64revol_util                             39498 non-null float64total_acc                              39498 non-null float64home_ownership_MORTGAGE                39498 non-null int64home_ownership_NONE                    39498 non-null int64home_ownership_OTHER                   39498 non-null int64home_ownership_OWN                     39498 non-null int64home_ownership_RENT                    39498 non-null int64verification_status_Not Verified       39498 non-null int64verification_status_Source Verified    39498 non-null int64verification_status_Verified           39498 non-null int64purpose_car                            39498 non-null int64purpose_credit_card                    39498 non-null int64purpose_debt_consolidation             39498 non-null int64purpose_educational                    39498 non-null int64purpose_home_improvement               39498 non-null int64purpose_house                          39498 non-null int64purpose_major_purchase                 39498 non-null int64purpose_medical                        39498 non-null int64purpose_moving                         39498 non-null int64purpose_other                          39498 non-null int64purpose_renewable_energy               
39498 non-null int64purpose_small_business                 39498 non-null int64purpose_vacation                       39498 non-null int64purpose_wedding                        39498 non-null int64term_ 36 months                        39498 non-null int64term_ 60 months                        39498 non-null int64dtypes: float64(12), int64(25)memory usage: 11.1 MBNone

# Analyse the data with logistic regression, a classic binary classifier.
# NOTE(review): sklearn.cross_validation and the KFold(n, ...) signature belong
# to an old scikit-learn API; they are kept because the later cells call KFold
# the same way -- confirm the installed scikit-learn version provides them.
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict, KFold

# Every column except the label is a feature; loan_status is the target.
cols = loans.columns
train_cols = cols.drop("loan_status")
features = loans[train_cols]
target = loans["loan_status"]

# Cross-validated predictions over the folds produced by KFold.
lr = LogisticRegression()
kf = KFold(features.shape[0], random_state=1)
cv_output = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(cv_output)

# Confusion-matrix counts: prediction vs. actual loan_status (1 or 0).
# False positive
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates: the true positive rate is very high, which is good because those are
# the loans that make money, but the false positive rate is also very high, so
# the chance of losing money is very high too.
# Analysis: the model lends to almost everyone because the classes are
# imbalanced.
# Remedy 1: data augmentation -- add more examples of loans that were not
# granted, either synthesised or collected.
tpr = tp / float(tp + fn)
# False positive rate = FP / (FP + TN), i.e. the share of actual negatives that
# were predicted positive. The previous denominator (fp + fn) mixed in actual
# positives, so the figures published with this post were not real FPR values.
fpr = fp / float(fp + tn)

print(tpr)
print(fpr)
print(predictions[:20])

0.9991435068960.9948727015560     11     12     13     14     15     16     17     18     19     110    111    112    113    114    115    116    117    118    119    1dtype: int64

# Pass a parameter that rebalances the weights of the positive and negative
# samples.
lr = LogisticRegression(class_weight='balanced')
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts: prediction vs. actual loan_status (1 or 0).
# False positive
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

tpr = tp / float(tp + fn)
# False positive rate = FP / (FP + TN); the previous denominator (fp + fn)
# mixed in actual positives and did not yield a real FPR.
fpr = fp / float(fp + tn)

# After adjusting the weights the trained model is more meaningful, but the
# TPR is not high enough and the FPR is not low enough.
print(tpr)
print(fpr)
print(predictions[:20])

0.6703682920350.16746233030     11     02     03     04     15     06     07     08     09     010    111    012    113    114    015    016    117    118    119    0dtype: int64

# The run above used the library's built-in weighting; the weights can also be
# specified by hand: class 0 is weighted 5 and class 1 is weighted 1.
penalty = {
    0: 5,
    1: 1,
}
lr = LogisticRegression(class_weight=penalty)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts: prediction vs. actual loan_status (1 or 0).
# False positive
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

tpr = tp / float(tp + fn)
# False positive rate = FP / (FP + TN); the previous denominator (fp + fn)
# mixed in actual positives and did not yield a real FPR.
fpr = fp / float(fp + tn)

# The weights strongly affect the final result; in practice tune them to the
# requirements at hand.
print(tpr)
print(fpr)
print(predictions[:20])

0.7186863167840.2156620553360     11     02     03     14     15     06     07     08     09     010    111    012    113    114    015    016    117    118    119    0dtype: int64

# Analyse the data with a random forest, using the library's built-in
# 'balanced' class weighting.
from sklearn.ensemble import RandomForestClassifier

# The name lr is reused for the estimator even though it is now a random forest.
lr = RandomForestClassifier(class_weight="balanced", random_state=1)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts: prediction vs. actual loan_status (1 or 0).
# False positive
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

tpr = tp / float(tp + fn)
# False positive rate = FP / (FP + TN); the previous denominator (fp + fn)
# mixed in actual positives and did not yield a real FPR.
fpr = fp / float(fp + tn)

# The random forest does not perform very well either.
print(tpr)
print(fpr)
print(predictions[:20])

0.9738621932130.8570505572610     11     12     13     14     15     06     17     18     19     110    111    112    113    114    115    116    117    118    119    0dtype: int64

# Analyse the data with a random forest, setting the number of trees to 10.
# NOTE(review): the published output of this run is identical to the previous
# random-forest run, which suggests 10 was already the default n_estimators in
# the scikit-learn version used -- confirm, and try a larger value.
from sklearn.ensemble import RandomForestClassifier

# The name lr is reused for the estimator even though it is a random forest.
lr = RandomForestClassifier(n_estimators=10, class_weight="balanced", random_state=1)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# Confusion-matrix counts: prediction vs. actual loan_status (1 or 0).
# False positive
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positive
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negative
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
# True negative
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

tpr = tp / float(tp + fn)
# False positive rate = FP / (FP + TN); the previous denominator (fp + fn)
# mixed in actual positives and did not yield a real FPR.
fpr = fp / float(fp + tn)

# The result is still not very good.
print(tpr)
print(fpr)
print(predictions[:20])

0.9738621932130.8570505572610     11     12     13     14     15     06     17     18     19     110    111    112    113    114    115    116    117    118    119    0dtype: int64

#实际中:换算法模型，去掉一些特征，生成一些新的特征，调模型的参数，比如权重等来实现更好的效果

0 0