数据分析类问题的框架
导入数据
python
import pandas as pd
import numpy as np
train_data=pd.read_csv('train.csv')
file_path='test.excel'
test_data=pd.read_excel(file_path)
数据预处理
缺失值
均值/中位数/众数插补法
python
from sklearn.imput import SimpleImputer
imputer=SimpleImputer(strategy='mean')
X_filled=imputer.fit_transform(X)#’mean’均值、'median’中位数、'most_frequent’众数
插值法
python
import numpy as np
import pandas as pd
from scipy.interpolate import lagrange
# 示例数据
data = pd.Series([1, 2, np.nan, 4, np.nan, 6, 7, np.nan, 9])
# 标记异常值(NaN)
is_anomaly = data.isna()
# 获取非异常值的索引和值
valid_index = data.index[~is_anomaly]
valid_values = data[~is_anomaly]
# 创建拉格朗日插值函数
lagrange_func = lagrange(valid_index, valid_values)
# 使用拉格朗日插值函数填充异常值
data_interpolated = data.copy()
data_interpolated[is_anomaly] = lagrange_func(data.index[is_anomaly])
print("原始数据:")
print(data)
print("\n拉格朗日插值后的数据:")
print(data_interpolated)
异常值
其他处理可类似缺失值
检测异常值
3sigma原则
python
lower_bound = mean - 3 * std
upper_bound = mean + 3 * std#std是方差
outliers = data[(data < lower_bound) | (data > upper_bound)]
数据增强
归一化标准化
目的是消除特征之间的量纲差异
python
import numpy as np
# 均值-方差归一化
def standardization(X):
X_mean = np.mean(X)
X_std = np.std(X)
X_new = (X - X_mean) / X_std
return X_new
# 最小-最大规范化
def min_max_normalization(X):
X_min = np.min(X)
X_max = np.max(X)
X_new = (X - X_min) / (X_max - X_min)
return X_new
python
from sklearn.preprocessing import MinMaxScale
scaler=MinMaxScaler()
X_scaler=scaler.fit_transform(X)
from sklearn.preprocessing import StandardScaler
scaler=Standardscaler
X_scaler=scaler.fit_transform(x)
确定机器学习的模型
线性模型
树模型
(XGBoost/lightgbm...)
XGBoost Parameters — xgboost 2.1.3 documentation
神经网络
python
#以mlp为例
#模型
class Model(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(HousePriceModel, self).__init__()
self.linear1 = nn.Linear(input_size, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, output_size)
self.relu = nn.ReLU()
def forward(self, x):
x = self.relu(self.linear1(x))
x = self.relu(self.linear2(x))
x = self.linear3(x)
return x
# 初始化模型
input_size = X_train_leaves.shape[1]
hidden_size = 64
output_size = 1
model = HousePriceModel(input_size, hidden_size, output_size)
# 定义损失函数和优化器
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# 训练模型
num_epochs = 20
for epoch in range(num_epochs):
model.train()
train_loss = 0
for batch_X, batch_y in train_loader:
# 前向传播
outputs = model(batch_X)
loss = criterion(outputs, batch_y)
# 反向传播和优化
optimizer.zero_grad()
loss.backward()
optimizer.step()
train_loss += loss.item()
# 验证集上的损失
model.eval()
val_loss = 0
with torch.no_grad():
for batch_X, batch_y in val_loader:
outputs = model(batch_X)
val_loss += criterion(outputs, batch_y).item()
print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')
支持向量机
SVC — scikit-learn 1.6.1 documentation
聚类
多模型
分为集成(加权平均,投票)和堆叠
调参
python
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
把训练数据集部分分出来为验证数据集,由一步的评估指标进行对参数的的优化
模型评估指标
准确率,召回率,混淆矩阵
K折交叉验证