Logistic Regression for Round Cat Construction Ltd.¶

We will build a logistic regression model to predict whether a given project is high risk.

Start by inspecting the data.

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [ ]:
path = 'logistic_regression_data1.txt'
data = pd.read_csv(path, header=None, names=['Investment Risk', 'Duration Risk', 'Project Risk'])
data.head()
Out[ ]:
Investment Risk Duration Risk Project Risk
0 34.623660 78.024693 0
1 30.286711 43.894998 0
2 35.847409 72.902198 0
3 60.182599 86.308552 1
4 79.032736 75.344376 1

Create a scatter plot of the two scores, using color to indicate whether each project is high risk.

In [ ]:
positive = data[data['Project Risk'].isin([1])]
negative = data[data['Project Risk'].isin([0])]

fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Investment Risk'], positive['Duration Risk'], s=50, c='b', marker='o', label='High Risk')
ax.scatter(negative['Investment Risk'], negative['Duration Risk'], s=50, c='r', marker='x', label='Low Risk')
ax.legend()
ax.set_xlabel('Investment Risk Score')
ax.set_ylabel('Duration Risk Score')
plt.show()
[Figure: scatter plot of Investment Risk vs. Duration Risk scores, color-coded by project risk]

There appears to be a clear decision boundary between the two classes. Now we need to implement logistic regression so we can make predictions for future projects.

The sigmoid function¶

Here $g$ denotes the sigmoid function: $g\left( z \right)=\frac{1}{1+{{e}^{-z}}}$ This gives the hypothesis of the logistic regression model: ${{h}_{\theta }}\left( x \right)=\frac{1}{1+{{e}^{-{{\theta }^{T}}X}}}$

In [ ]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
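Before the plot, a couple of quick numeric checks (ours, not part of the original notebook) pin down the properties we expect: $g(0)=0.5$ and $g(-z)=1-g(z)$.

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# g(0) is exactly 0.5, and g is symmetric about that point: g(-z) = 1 - g(z)
print(sigmoid(0))                                  # 0.5
print(np.isclose(sigmoid(3) + sigmoid(-3), 1.0))   # True
```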

A quick check to make sure the sigmoid function works as intended.

In [ ]:
nums = np.arange(-10, 10, step=1)

fig, ax = plt.subplots(figsize=(12,8))
ax.plot(nums, sigmoid(nums), 'r')
plt.show()
[Figure: sigmoid curve plotted over the interval $[-10, 10)$]

Success! Now write the cost function to evaluate candidate parameters. The cost function is: $J\left( \theta \right)=\frac{1}{m}\sum\limits_{i=1}^{m}{[-{{y}^{(i)}}\log \left( {{h}_{\theta }}\left( {{x}^{(i)}} \right) \right)-\left( 1-{{y}^{(i)}} \right)\log \left( 1-{{h}_{\theta }}\left( {{x}^{(i)}} \right) \right)]}$

In [ ]:
def cost(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    return np.sum(first - second) / (len(X))
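The np.matrix class used above is deprecated in recent NumPy releases; the same cost can be written with plain ndarrays. A sketch under our own naming (cost_vec is not a notebook function):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost_vec(theta, X, y):
    # theta: (n,), X: (m, n), y: (m,) -- plain ndarrays instead of np.matrix
    h = sigmoid(X @ theta)
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

# With theta = 0 every prediction h is 0.5, so the cost is exactly log(2)
X = np.array([[1.0, 2.0], [1.0, 3.0]])
y = np.array([0.0, 1.0])
print(np.isclose(cost_vec(np.zeros(2), X, y), np.log(2)))  # True
```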

Prepare the data variables.

In [ ]:
# add a ones column - this makes the matrix multiplication work out easier
data.insert(0, 'Ones', 1)

# set X (training data) and y (target variable)
cols = data.shape[1]
X = data.iloc[:,0:cols-1]
y = data.iloc[:,cols-1:cols]

# convert to numpy arrays and initalize the parameter array theta
X = np.array(X.values)
y = np.array(y.values)
theta = np.zeros(3)

Check the matrix dimensions to make sure everything looks right.

In [ ]:
theta
Out[ ]:
array([0., 0., 0.])
In [ ]:
X.shape, theta.shape, y.shape
Out[ ]:
((100, 3), (3,), (100, 1))

Compute the cost for the initial parameters (theta all zeros).

In [ ]:
cost(theta, X, y)
Out[ ]:
0.6931471805599453
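That value is no accident: with theta all zeros, the model outputs $h_\theta = 0.5$ for every example, so each example contributes $-\log(0.5) = \ln 2$ to the cost regardless of its label. A one-line check:

```python
import numpy as np

# With theta = 0, every prediction is 0.5, so the cost is -log(0.5) = ln 2
print(np.log(2))  # 0.6931471805599453
```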

Looks good. Next, write the gradient function used for training.

Gradient descent¶

  • Batch gradient descent: $$\frac{\partial J\left( \theta \right)}{\partial {{\theta }_{j}}}=\frac{1}{m}\sum\limits_{i=1}^{m}{({{h}_{\theta }}\left( {{x}^{(i)}} \right)-{{y}^{(i)}})x_{j}^{(i)}}$$
  • In vectorized form: $\frac{1}{m} X^T\left( sigmoid(X\theta) - y \right)$
In [ ]:
def gradient(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    
    error = sigmoid(X * theta.T) - y
    
    for i in range(parameters):
        term = np.multiply(error, X[:,i])
        grad[i] = np.sum(term) / len(X)
    
    return grad
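The loop above implements the component formula one $\theta_j$ at a time; the vectorized expression from the bullet list can be written directly with ndarrays. A sketch (gradient_vec is our name, and y is taken as a flat array):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def gradient_vec(theta, X, y):
    # (1/m) * X^T (sigmoid(X @ theta) - y), the vectorized gradient
    return X.T @ (sigmoid(X @ theta) - y) / len(X)

# Tiny check: theta = 0 makes every prediction 0.5
X = np.array([[1.0, 0.0], [1.0, 1.0]])
y = np.array([0.0, 1.0])
print(gradient_vec(np.zeros(2), X, y))  # components 0.0 and -0.25
```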

Note that we do not actually run the full gradient-descent loop here; this function only computes a single gradient step. In Python we can hand the training itself to the optimize module of the scipy package (see below).

Now use SciPy's truncated Newton (TNC) solver to find the optimal parameters.

In [ ]:
import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
result
Out[ ]:
(array([-25.16131865,   0.20623159,   0.20147149]), 36, 0)

The cost at the best parameters found by the TNC optimizer:

In [ ]:
cost(result[0], X, y)
Out[ ]:
0.20349770158947447

Next, write a function that uses the optimized parameters theta to produce predictions for a dataset X, then use it to score the classifier's training accuracy. Recall the hypothesis of the logistic regression model: $${{h}_{\theta }}\left( x \right)=\frac{1}{1+{{e}^{-{{\theta }^{T}}X}}}$$ When ${{h}_{\theta }} \ge 0.5$, predict y = 1;

when ${{h}_{\theta }} < 0.5$, predict y = 0.

In [ ]:
def predict(theta, X):
    probability = sigmoid(X * theta.T)
    return [1 if x >= 0.5 else 0 for x in probability]
In [ ]:
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y)]
accuracy = (sum(map(int, correct)) / len(correct))
print('accuracy = {0:.0%}'.format(accuracy))
accuracy = 89%
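The zip/list-comprehension bookkeeping above can be collapsed into a single vectorized comparison. A sketch on made-up labels (with the notebook's data you would compare predictions against y.ravel()):

```python
import numpy as np

predictions = np.array([1, 0, 1, 1])
labels = np.array([1, 0, 0, 1])

# The mean of the elementwise equality gives the accuracy directly
accuracy = np.mean(predictions == labels)
print('accuracy = {0:.0%}'.format(accuracy))  # accuracy = 75%
```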

Our logistic regression classifier reaches 89% accuracy. Not bad! Remember, though, that this is accuracy on the training set, so it is very likely an overestimate of the true performance.

Seem like a lot of work? That is because we implemented nearly every detail ourselves; doing the same with scikit-learn takes only a few lines.

In [ ]:
from sklearn import linear_model  # scikit-learn's linear models package
model = linear_model.LogisticRegression()
model.fit(X, y.ravel())
model.score(X,y)
Out[ ]:
0.89

Logistic regression: nonlinear boundaries and regularization¶

In this part we improve the logistic regression model by adding a regularization term. In short, regularization is a technique applied to the cost function that biases the algorithm toward "simpler" models, reducing overfitting and improving generalization.

Imagine you are the newly appointed head of micro-loans at a bank, and you have each applicant's results on two tests. Based on these two tests, you must decide whether to accept or reject each loan application. To support that decision you have a historical dataset, from which we can build a logistic regression model.

Much like part one, let's start by visualizing the data!

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

path =  'logistic_regression_data2.txt'
data2 = pd.read_csv(path, header=None, names=['Test 1', 'Test 2', 'Accepted'])
data2.head()
Out[ ]:
Test 1 Test 2 Accepted
0 0.051267 0.69956 1
1 -0.092742 0.68494 1
2 -0.213710 0.69225 1
3 -0.375000 0.50219 1
4 -0.513250 0.46564 1

This time we write a function to handle the plotting.

In [ ]:
def plotData(data,X_labels,y_label):
    positive = data[data[y_label].isin([1])]
    negative = data[data[y_label].isin([0])]

    fig, ax = plt.subplots(figsize=(12,8))
    ax.scatter(positive[X_labels[0]], positive[X_labels[1]], s=50, c='b', marker='o', label='Accepted')
    ax.scatter(negative[X_labels[0]], negative[X_labels[1]], s=50, c='r', marker='x', label='Rejected')
    ax.legend()
    ax.set_xlabel(X_labels[0]+' Score')
    ax.set_ylabel(X_labels[1]+' Score')
    
plotData(data2,['Test 1','Test 2'],'Accepted')
plt.show()
[Figure: scatter plot of Test 1 vs. Test 2 scores, color-coded by acceptance]

Wow, this data looks considerably more complex than last time. In particular, notice that no straight line can serve as a decision boundary here. What to do? One basic approach is to raise the "order" of the model, i.e. create more features from the existing data points; here we generate all polynomial terms up to a given degree.

In [ ]:
def feature_mapping(x1, x2, degree):
    data=pd.DataFrame()
    for i in range(1, degree+1):
        for j in range(0, i+1):
            data['F' + str(i) + str(j)] = np.power(x1, i-j) * np.power(x2, j)
    
    return data

x1 = data2['Test 1']
x2 = data2['Test 2']

degree=6
data2_xfm=feature_mapping(x1,x2,degree)
data2_xfm.head()
Out[ ]:
F10 F11 F20 F21 F22 F30 F31 F32 F33 F40 ... F53 F54 F55 F60 F61 F62 F63 F64 F65 F66
0 0.051267 0.69956 0.002628 0.035864 0.489384 0.000135 0.001839 0.025089 0.342354 0.000007 ... 0.000900 0.012278 0.167542 1.815630e-08 2.477505e-07 0.000003 0.000046 0.000629 0.008589 0.117206
1 -0.092742 0.68494 0.008601 -0.063523 0.469143 -0.000798 0.005891 -0.043509 0.321335 0.000074 ... 0.002764 -0.020412 0.150752 6.362953e-07 -4.699318e-06 0.000035 -0.000256 0.001893 -0.013981 0.103256
2 -0.213710 0.69225 0.045672 -0.147941 0.479210 -0.009761 0.031616 -0.102412 0.331733 0.002086 ... 0.015151 -0.049077 0.158970 9.526844e-05 -3.085938e-04 0.001000 -0.003238 0.010488 -0.033973 0.110047
3 -0.375000 0.50219 0.140625 -0.188321 0.252195 -0.052734 0.070620 -0.094573 0.126650 0.019775 ... 0.017810 -0.023851 0.031940 2.780914e-03 -3.724126e-03 0.004987 -0.006679 0.008944 -0.011978 0.016040
4 -0.513250 0.46564 0.263426 -0.238990 0.216821 -0.135203 0.122661 -0.111283 0.100960 0.069393 ... 0.026596 -0.024128 0.021890 1.827990e-02 -1.658422e-02 0.015046 -0.013650 0.012384 -0.011235 0.010193

5 rows × 27 columns
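As a sanity check (ours, not in the notebook): the mapping emits one column per pair $(i, j)$ with $1 \le i \le d$ and $0 \le j \le i$, i.e. $d(d+3)/2$ columns, which becomes $(d+1)(d+2)/2$ once the intercept column is added:

```python
# One column per (i, j) with 1 <= i <= degree, 0 <= j <= i
degree = 6
n_terms = sum(i + 1 for i in range(1, degree + 1))
print(n_terms)      # 27 -- matches the 27 columns shown above
print(n_terms + 1)  # 28 -- with the intercept column added
```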

Now let's modify the cost and gradient functions to include the regularization term. First the cost function:

Regularized cost function¶

$$J\left( \theta \right)=\frac{1}{m}\sum\limits_{i=1}^{m}{[-{{y}^{(i)}}\log \left( {{h}_{\theta }}\left( {{x}^{(i)}} \right) \right)-\left( 1-{{y}^{(i)}} \right)\log \left( 1-{{h}_{\theta }}\left( {{x}^{(i)}} \right) \right)]}+\frac{\lambda }{2m}\sum\limits_{j=1}^{n}{\theta _{j}^{2}}$$

In [ ]:
def costReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[:,1:theta.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg
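One useful property to verify: with $\lambda = 0$ the regularized cost must reduce to the unregularized one. A self-contained sketch with plain ndarrays (the function names here are ours, not the notebook's):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost(theta, X, y):
    h = sigmoid(X @ theta)
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

def cost_reg(theta, X, y, lam):
    # Add (lam / 2m) * sum_j theta_j^2, skipping the intercept theta_0
    reg = (lam / (2 * len(X))) * np.sum(theta[1:] ** 2)
    return cost(theta, X, y) + reg

X = np.array([[1.0, 0.5], [1.0, -0.5]])
y = np.array([1.0, 0.0])
theta = np.array([0.1, 0.2])
print(np.isclose(cost_reg(theta, X, y, 0.0), cost(theta, X, y)))  # True
```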

Compared to before there is an extra "reg" term. Note also $\lambda$: it is a hyperparameter controlling the strength of the regularization term (the code names it learningRate, though it is not a learning rate in the usual sense). Now we need to add regularization to the gradient function as well:

Because ${{\theta }_{0}}$ is conventionally not regularized, the gradient descent update splits into two cases: \begin{align} & Repeat\text{ }until\text{ }convergence\text{ }\!\!\{\!\!\text{ } \\ & \text{ }{{\theta }_{0}}:={{\theta }_{0}}-a\frac{1}{m}\sum\limits_{i=1}^{m}{[{{h}_{\theta }}\left( {{x}^{(i)}} \right)-{{y}^{(i)}}]x_{0}^{(i)}} \\ & \text{ }{{\theta }_{j}}:={{\theta }_{j}}-a\left[ \frac{1}{m}\sum\limits_{i=1}^{m}{[{{h}_{\theta }}\left( {{x}^{(i)}} \right)-{{y}^{(i)}}]x_{j}^{(i)}}+\frac{\lambda }{m}{{\theta }_{j}} \right] \\ & \text{ }\!\!\}\!\!\text{ } \\ \end{align}

Rearranging the update for j = 1, 2, ..., n gives: ${{\theta }_{j}}:={{\theta }_{j}}(1-a\frac{\lambda }{m})-a\frac{1}{m}\sum\limits_{i=1}^{m}{({{h}_{\theta }}\left( {{x}^{(i)}} \right)-{{y}^{(i)}})x_{j}^{(i)}}$
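This rewritten form makes the effect visible: each step first shrinks $\theta_j$ (for $j \ge 1$) by the factor $(1 - a\lambda/m)$, then takes an ordinary gradient step. A minimal sketch of one such step (gd_step is our name; the notebook itself delegates training to an optimizer instead):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def gd_step(theta, X, y, alpha, lam):
    # theta_0 is left unshrunk; theta_j (j >= 1) is multiplied by
    # (1 - alpha * lam / m) before the usual gradient step.
    m = len(X)
    grad = X.T @ (sigmoid(X @ theta) - y) / m
    shrink = np.ones_like(theta)
    shrink[1:] = 1 - alpha * lam / m
    return theta * shrink - alpha * grad
```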

In [ ]:
def gradientReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    
    error = sigmoid(X * theta.T) - y
    
    for j in range(parameters):
        term = np.multiply(error, X[:,j])
        
        if (j == 0):
            grad[j] = np.sum(term) / len(X)
        else:
            grad[j] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[0,j])
    
    return grad
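The same function in vectorized form with plain ndarrays keeps the intercept unpenalized by adding the $\frac{\lambda}{m}\theta_j$ term only for $j \ge 1$. A sketch (gradient_reg_vec is our name):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def gradient_reg_vec(theta, X, y, lam):
    # (1/m) X^T (h - y), plus (lam/m) * theta_j for every j except j = 0
    m = len(X)
    grad = X.T @ (sigmoid(X @ theta) - y) / m
    grad[1:] += (lam / m) * theta[1:]
    return grad
```

With lam = 0 this coincides with the unregularized gradient from part one.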

Now initialize the variables, just as in part one.

In [ ]:
# add a ones column - this makes the matrix multiplication work out easier
data2_xfm.insert(0, 'Ones', 1)

# set X (training data) and y (target variable)
X2 = data2_xfm
cols = data2.shape[1]
y2 = data2.iloc[:,cols-1:cols]

# convert to numpy arrays and initalize the parameter array theta
X2 = np.array(X2.values)
y2 = np.array(y2.values)
theta2 = np.zeros(((degree+1)*(degree+2))//2)  # parameter count after feature mapping (including the intercept)

Set the regularization strength to a reasonable initial value; it can be adjusted later if necessary (i.e. if the penalty turns out to be too strong or too weak).

In [ ]:
learningRate = 1

Now call the new regularized functions with theta initialized to all zeros, to make sure they compute correctly.

In [ ]:
costReg(theta2, X2, y2, learningRate)
Out[ ]:
0.6931471805599454
In [ ]:
gradientReg(theta2, X2, y2, learningRate)
Out[ ]:
array([8.47457627e-03, 1.87880932e-02, 7.77711864e-05, 5.03446395e-02,
       1.15013308e-02, 3.76648474e-02, 1.83559872e-02, 7.32393391e-03,
       8.19244468e-03, 2.34764889e-02, 3.93486234e-02, 2.23923907e-03,
       1.28600503e-02, 3.09593720e-03, 3.93028171e-02, 1.99707467e-02,
       4.32983232e-03, 3.38643902e-03, 5.83822078e-03, 4.47629067e-03,
       3.10079849e-02, 3.10312442e-02, 1.09740238e-03, 6.31570797e-03,
       4.08503006e-04, 7.26504316e-03, 1.37646175e-03, 3.87936363e-02])

Now use the same optimization function as in part one to compute the optimized result.

In [ ]:
result2 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate))
result2
Out[ ]:
(array([ 1.27271027,  0.62529965,  1.18111686, -2.01987399, -0.9174319 ,
        -1.43166929,  0.12393227, -0.36553118, -0.35725403, -0.17516292,
        -1.45817009, -0.05098418, -0.61558552, -0.27469165, -1.19271298,
        -0.24217841, -0.20603297, -0.04466178, -0.27778952, -0.29539513,
        -0.45645981, -1.04319155,  0.02779373, -0.29244872,  0.01555761,
        -0.32742406, -0.1438915 , -0.92467487]),
 32,
 1)

Finally, use the prediction function from part one to check our model's accuracy on the training data (again, this is not necessarily conclusive).

In [ ]:
theta_min2 = np.matrix(result2[0])
predictions = predict(theta_min2, X2)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y2)]
accuracy = (sum(map(int, correct)) / len(correct))
print('accuracy = {0:.0%}'.format(accuracy))
accuracy = 83%

Solve the same problem with scikit-learn, for comparison with our model.

In [ ]:
from sklearn import linear_model  # scikit-learn's linear models package
model = linear_model.LogisticRegression(penalty='l2', C=1.0)
model.fit(X2, y2.ravel())
Out[ ]:
LogisticRegression()
In [ ]:
model.score(X2, y2)
Out[ ]:
0.8305084745762712

This matches the training accuracy of our own implementation exactly. In any case, remember that conclusions drawn from training-set accuracy alone are not necessarily reliable.
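scikit-learn controls regularization through C, the inverse of the regularization strength (roughly C ≈ 1/λ): smaller C means a stronger penalty and smaller coefficients. A quick sketch on synthetic data (our own, not the notebook's):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# A stronger penalty (smaller C) pulls the coefficient norm toward zero
for C in (0.01, 1.0, 100.0):
    w = LogisticRegression(C=C).fit(X, y).coef_
    print(C, np.linalg.norm(w))
```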

Plot the decision boundary the model has found.

In [ ]:
def find_decision_boundary(density, degree, theta, threshold, cord_bounds):
    # Evaluate theta^T x over a dense grid of mapped features and keep the
    # points where it is numerically close to zero, i.e. near the boundary.
    t1 = np.linspace(cord_bounds[0], cord_bounds[1], density)
    t2 = np.linspace(cord_bounds[2], cord_bounds[3], density)

    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, degree)
    mapped_cord.insert(0, 'Ones', 1)

    inner_product = np.matrix(mapped_cord) * theta.T

    decision = mapped_cord[np.abs(inner_product) < threshold]

    # F10 and F11 are the original x1 and x2 coordinates
    return decision.F10, decision.F11

def draw_boundary(data, degree, theta, X_labels, y_label, cord_bounds):
    density = 1000
    threshold = 2 * 10**-3

    x, y = find_decision_boundary(density, degree, theta, threshold, cord_bounds)

    plotData(data, X_labels, y_label)
    plt.scatter(x, y, c='green', s=10)
    plt.title('Decision boundary')
  
draw_boundary(data2,degree,theta_min2,['Test 1','Test 2'],'Accepted',[-1,1.5,-1,1.5])
plt.show()
[Figure: Test 1 vs. Test 2 scatter plot with the nonlinear decision boundary drawn in green]

Use draw_boundary to plot the decision boundary of the model from part one as well.

In [ ]:
draw_boundary(data,1,theta_min,['Investment Risk','Duration Risk'],'Project Risk',[30,100,30,100])
plt.show()
[Figure: Investment Risk vs. Duration Risk scatter plot with the linear decision boundary drawn in green]