# Import packages
from sklearn.datasets import load_boston  # Used to load the Boston housing price data set
# pandas toolkit. If you are unfamiliar with pandas, see the official 10-minute tutorial: https://pandas.pydata.org/pandas-docs/stable/10min.html
import pandas as pd
import numpy as np
# seaborn for drawing
import seaborn as sns
import matplotlib.pyplot as plt
# Show plots inline
%matplotlib inline
data = load_boston()  # load dataset
data.keys() # Fields inside data
df = pd.DataFrame(data['data'])
# Looking at the first 5 rows of the dataframe, we can see that the column names are numbers
df.head(5)
data['feature_names'] # Feature name
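If you prefer a readable dataframe, the numeric column names can be replaced with these feature names. This is a small optional step that is not in the original cells:

df.columns = data['feature_names']  # optional: use the feature names as column names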
The table below lists each parameter and its meaning (translated from the original Chinese):

| Parameter | Description |
| --- | --- |
| CRIM | Per-capita crime rate of the town where the house is located |
| ZN | Proportion of residential land zoned for lots over 25,000 sq. ft. |
| INDUS | Proportion of non-retail business land in the town |
| CHAS | Charles River dummy variable (1 if the house is on the river bank, 0 otherwise) |
| NOX | Nitric oxide concentration |
| RM | Average number of rooms per dwelling |
| AGE | Proportion of owner-occupied homes built before 1940 |
| DIS | Weighted distance from the house to five major Boston employment centers |
| RAD | Index of the highway entrance nearest to the house |
| TAX | Full-value property tax per $10,000 |
| PTRATIO | Pupil-teacher ratio of the town |
| B | 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents in the town |
| LSTAT | Percentage of lower-status population |
| MEDV | Median value of owner-occupied homes (in thousands of dollars) |
# The target is the house price, which is also our target value; assign it to the dataframe
df['price'] = data['target']
df.head(5)
# View the correlation coefficient between each feature and price (positive and negative correlations)
sns.heatmap(df.corr(), annot=True, fmt='.1f')
plt.scatter(df['RM'], df['price'])
plt.figure(figsize=(20, 5))
# View the data distribution of some features against price
features = ['LSTAT', 'RM']
target = df['price']
for i, col in enumerate(features):
    plt.subplot(1, len(features), i + 1)
    x = df[col]
    y = target
    plt.scatter(x, y, marker='o')
    plt.title('{} price'.format(col))
    plt.xlabel(col)
    plt.ylabel('price')
# Simple example: predict price from a single variable
x = df['RM']
y = df['price']
history_notes = {_x: _y for _x, _y in zip(x, y)}
history_notes[6.575]
# Find the three prices whose RM values are closest to 6.57
similary_ys = [y for _, y in sorted(history_notes.items(), key=lambda x_y: (x_y[0] - 6.57) ** 2)[:3]]
similary_ys
# Calculate the average of the three
np.mean(similary_ys)
Using historical data to predict data that has never been seen before: the most direct method is K-Nearest Neighbors (KNN).
def knn(query_x, history, top_n=3):
    sorted_notes = sorted(history.items(), key=lambda x_y: (x_y[0] - query_x) ** 2)
    similar_notes = sorted_notes[:top_n]
    similar_ys = [y for _, y in similar_notes]
    return np.mean(similar_ys)
knn(5.4, history_notes)
To obtain results faster, we hope to gain predictive power by fitting a function instead.
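Concretely, "fitting a function" means choosing the parameters of a simple parametric model. The sketch below only illustrates the idea for a single feature; the parameter values are arbitrary placeholders, not fitted results:

# Hypothetical single-feature model: price modeled as a linear function of RM
def f(rm, w, b):
    return w * rm + b

f(6.575, 8.0, -30.0)  # arbitrary illustrative parameters; the training code below fits a multivariate version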
df.describe() # Data description, you can view the statistics of each variable
3. Data preprocessing
When a data set has many dimensions, normalization or standardization prevents one or a few dimensions from having an outsized influence on the result, and it also helps the program run faster. There are many methods, such as min-max scaling, z-score standardization, p-norm normalization, etc.; which one to use depends on the characteristics of the data set.
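As a small illustration of two of these methods (a NumPy-only sketch, not part of the original cells), min-max scaling and z-score standardization of a single column look like this:

rm = df['RM'].values
rm_minmax = (rm - rm.min()) / (rm.max() - rm.min())  # min-max: rescales values into [0, 1]
rm_zscore = (rm - rm.mean()) / rm.std()              # z-score: zero mean, unit variance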
from sklearn.preprocessing import StandardScaler  # z = (x - u) / s, where u is the mean and s is the standard deviation
ss = StandardScaler()
# For linear models, normalization or standardization is generally required, otherwise gradients can explode; tree models generally do not need it
data_train = ss.fit_transform(data_train)
data_train = pd.DataFrame(data_train, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'])
data_train.describe()
# y = Σ(w_i * x_i) + b
# The derivative with respect to b is always 1, so add a bias column fixed at 1 as an extra feature;
# b is then updated by the same gradient rule as the weights w_i
data_train['bias'] = 1
data_train
Divide the data set: 20% of the data is used as the test set (X_test, y_test) and the other 80% as the training set (X_train, y_train); random_state is the random seed.
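The split itself is not shown in this section; a minimal sketch using scikit-learn's train_test_split (assuming the standardized, bias-augmented data_train and df['price'] as inputs, and mapping the results onto the train_x / train_y names used by the training code below) could look like this:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_train.values, df['price'].values,
    test_size=0.2,     # 20% of the data goes to the test set
    random_state=42)   # random seed; 42 is an arbitrary choice

train_x, train_y = X_train, y_train  # names used by gradient_descent below
test_x, test_y = X_test, y_test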
def l1_cost(x, y, theta):
    """
    x: features
    y: target values
    theta: model parameters
    """
    k = x.shape[0]
    total_cost = 0
    for i in range(k):
        total_cost += 1 / k * np.abs(y[i] - theta.dot(x[i, :]))
    return total_cost
def l2_cost(x, y, theta):
    k = x.shape[0]
    total_cost = 0
    for i in range(k):
        total_cost += 1 / k * (y[i] - theta.dot(x[i, :])) ** 2
    return total_cost
np.zeros(10).shape
def step_l1_gradient(x, y, learning_rate, theta):
    """
    Calculate the gradient of the MAE (L1) loss function.
    The gradient at the non-differentiable point 0 is taken to be 0.
    x: feature vectors
    y: target values
    learning_rate: learning rate
    theta: parameters
    """
    n = x.shape[0]
    e = y - x @ theta
    gradients = -(x.T @ np.sign(e)) / n  # np.sign is the sign function
    theta = theta - learning_rate * gradients
    return theta
def step_l2_gradient(x, y, learning_rate, theta):
    k = x.shape[0]
    n = x.shape[1]
    gradients = np.zeros(n)
    for i in range(k):
        for j in range(n):
            gradients[j] += (-2 / k) * (y[i] - theta.dot(x[i, :])) * x[i, j]
    theta = theta - learning_rate * gradients
    return theta
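The double loop above is easy to read but slow on larger data sets. An equivalent vectorized version (a sketch, not part of the original cells) computes the same gradient with matrix operations:

def step_l2_gradient_vectorized(x, y, learning_rate, theta):
    k = x.shape[0]
    e = y - x @ theta                 # residuals
    gradients = (-2 / k) * (x.T @ e)  # same values as the nested-loop version
    return theta - learning_rate * gradients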
# def step_gradient(X, y, learning_rate, theta):
#     """
#     X: feature vectors
#     y: target values
#     learning_rate: learning rate
#     theta: parameters
#     """
#     m_deriv = 0
#     N = len(X)
#     for i in range(N):
#         # Partial derivative: -x(y - (mx + b)) / |y - (mx + b)|
#         m_deriv += -X[i] * (y[i] - (theta * X[i] + b)) / abs(y[i] - (theta * X[i] + b))
#     # Subtract because the derivative points in the direction of steepest ascent
#     theta -= (m_deriv / float(N)) * learning_rate
#     # theta = theta - learning_rate * gradients
#     return theta
def gradient_descent(train_x, train_y, learning_rate, iterations):
    k = train_x.shape[0]
    n = train_x.shape[1]
    theta = np.zeros(n)  # Initialize parameters
    loss_values = []
    for i in range(iterations):
        theta = step_l1_gradient(train_x, train_y, learning_rate, theta)
        loss = l1_cost(train_x, train_y, theta)
        loss_values.append(loss)
        print(i, 'cost:', loss)
    return theta, loss_values
# Training parameters
learning_rate = 0.04  # Learning rate
iterations = 300  # Number of iterations
theta, loss_values = gradient_descent(train_x, train_y, learning_rate, iterations)
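To check whether training converged, one simple follow-up (a sketch; it assumes the test_x / test_y arrays from the split sketch above) is to plot the loss curve and compute the error on the test set:

plt.plot(loss_values)  # the L1 cost should decrease and flatten out as training converges
plt.xlabel('iteration')
plt.ylabel('L1 cost')

y_pred = test_x @ theta  # predictions of the fitted linear model
print('test MAE:', np.mean(np.abs(test_y - y_pred)))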