Logistic Regression: Predicting Horse Fatality from Colic Symptoms

#!/usr/bin/python
# -*- coding: utf-8 -*-
from numpy import *

# Open the text file and read it line by line; the first two values on each
# line are x1 and x2, the third column is the class label.
# For convenience of computation, x0 is set to 1.0.
def loadDataSet():
    dataMat = []; labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split('\t')
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(inX):
    return 1.0/(1 + exp(-inX))

# Improved stochastic gradient ascent
def stocGradAscent(dataMatrix, classLabel, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)  # initialize all regression coefficients to 1
    for j in range(numIter):
        dataIndex = range(m)
        for i in range(m):
            # alpha is adjusted on every iteration: it keeps shrinking but
            # never reaches 0, which damps oscillations in the coefficients
            alpha = 4/(1.0 + j + i) + 0.0001
            # pick a sample at random to reduce periodic fluctuations
            randIndex = int(random.uniform(0, len(dataIndex)))
            h = sigmoid(sum(dataMatrix[randIndex] * weights))
            error = classLabel[randIndex] - h
            # update the regression coefficients by alpha * gradient
            weights = weights + alpha * error * dataMatrix[randIndex]
            del(dataIndex[randIndex])
    return weights

# Test stage: classify with the trained Logistic regression model.
# Inputs are the feature vector and the regression coefficients.
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0

# Open the training and test sets and format the data
def colicTest():
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    # compute the regression coefficient vector
    trainWeights = stocGradAscent(array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    # compute the classification error rate on the test set
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = float(errorCount)/numTestVec
    print "the error rate of this test is : %f" % errorRate
    return errorRate

# Run the test several times and average the error rate
def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print "after %d iterations the average error rate is : %f" % (numTests, errorSum/float(numTests))
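What the inner loop of stocGradAscent performs is the standard logistic-regression gradient ascent step, weights := weights + alpha * (label - sigmoid(weights·x)) * x, applied to one randomly chosen sample at a time. For comparison, below is a minimal sketch of the full-batch variant of the same update; gradAscent is not part of the listing above, and it assumes the same data layout that loadDataSet returns.

from numpy import mat, shape, ones, exp

def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    # full-batch gradient ascent: every step uses all m samples at once
    dataMatrix = mat(dataMatIn)               # m x n feature matrix
    labelMat = mat(classLabels).transpose()   # m x 1 column vector of labels
    m, n = shape(dataMatrix)
    weights = ones((n, 1))
    for k in range(maxCycles):
        h = 1.0 / (1 + exp(-dataMatrix * weights))   # predictions for all samples
        error = labelMat - h                          # residuals, m x 1
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights

On small data such as testSet.txt the two approaches usually end up with similar coefficients; the stochastic version simply gets there after touching far fewer samples per update, which is what makes it practical for the larger colic data set.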

Test:

>>> import Logistic
>>> multiTest()
the error rate of this test is : 0.358209
the error rate of this test is : 0.388060
the error rate of this test is : 0.343284
the error rate of this test is : 0.402985
the error rate of this test is : 0.402985
the error rate of this test is : 0.253731
the error rate of this test is : 0.388060
the error rate of this test is : 0.417910
the error rate of this test is : 0.298507
the error rate of this test is : 0.402985
after 10 iterations the average error rate is : 0.365672
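To score a single horse rather than a whole test file, the trained weights and classifyVector can be reused directly. The sketch below assumes the listing above is saved as Logistic.py (as in the session above); the sample record is made up purely for illustration, whereas a real line of horseColicTest.txt carries 21 tab-separated features plus the label.

from numpy import array
from Logistic import stocGradAscent, classifyVector

# train once on the training file, exactly as colicTest() does internally
trainingSet = []; trainingLabels = []
for line in open('horseColicTraining.txt').readlines():
    currLine = line.strip().split('\t')
    trainingSet.append([float(currLine[i]) for i in range(21)])
    trainingLabels.append(float(currLine[21]))
weights = stocGradAscent(array(trainingSet), trainingLabels, 1000)

# hypothetical 21-feature record (made-up values, for illustration only)
sample = array([2.0, 1.0, 38.5] + [0.0] * 18)
print classifyVector(sample, weights)   # prints the predicted class, 1.0 or 0.0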