Mushroom Hunter
Feels like I haven't posted anything of substance here, so I'm moving my woc stuff over, if only to leave myself a record; after all, as everyone knows, a lot of ML code is highly reusable (manual doge).
Preface: I'm a noob, please don't flame me. This is a simple toy built with LightGBM + MLP, the kind of thing you can finish in a few hours (of course, if you can't use AI, or you're a big shot, pretend I said nothing).
Libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
import torch.nn as nn
import torch
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
ML-style preprocessing
By the way, it would be great if these code boxes supported bold text.
When I first saw this dataset I planned to throw it straight at LightGBM (even though the dataset is big enough to justify a DL approach), so the preprocessing uses OrdinalEncoder. LightGBM isn't sensitive to missing values and performs well, which is gratifying. The concat / drop / slicing steps that follow are just the usual splitting into features and training targets, and train_test_split simply carves out a validation set. If I remember right it's around 300k rows, so holding out 0.5 might even be more than the validation set needs?
Xy_train = pd.read_csv('train.csv')  # load raw data
X_test = pd.read_csv('test.csv')
Xy_all = pd.concat([Xy_train, X_test], axis=0)  # encode train and test together
cat_features = Xy_all.columns[Xy_all.dtypes == 'object']
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    encoded_missing_value=-1,
).set_output(transform="pandas")  # the set_output API keeps the result as a DataFrame
Xy_all[cat_features] = ordinal_encoder.fit_transform(Xy_all[cat_features])
# Test rows have no label, so their 'class' was NaN and is now encoded as -1
X_test = Xy_all[Xy_all["class"] == -1].drop(columns=["class"])  # test data
Xy_train = Xy_all[Xy_all["class"] != -1]
X_train = Xy_train.drop(columns=["class"])  # features
y_train = Xy_train["class"]  # target
from sklearn.model_selection import train_test_split  # train/validation split
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42, stratify=y_train
)  # X1_train is training data, X1_test is validation data, X_test is test data
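A sanity check I'd add here (not in the original): the submission code at the end assumes 'e' encoded to 0 and 'p' to 1. OrdinalEncoder sorts categories alphabetically, so that should hold, but it's two lines to verify:
# 'class' categories are sorted alphabetically: expect ['e', 'p'] plus NaN
# (the NaN comes from the unlabeled test rows and is encoded as -1).
class_idx = list(cat_features).index("class")
print(ordinal_encoder.categories_[class_idx])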
ml
This is the LightGBM feature-extraction step. You could actually stop right here, skip the DL part entirely, and still get a very decent score (I tried grid-search tuning, but got tired of waiting and gave up, manual doge); a baseline sketch follows the extraction code below. Define the LightGBM model, fit it on the data, then extract features. The extraction step is rather magical, isn't it? It almost makes everything above feel redundant.
def extract_features_with_lightgbm(X_train, y_train, X_val, X_test):
    # Initialize the LightGBM model
    lgbm = lgb.LGBMClassifier(
        boosting_type='gbdt',
        num_leaves=31,
        max_depth=-1,
        learning_rate=0.08,
        n_estimators=130,
        subsample_for_bin=200000,
        objective=None,
        class_weight=None,
        min_split_gain=0.0,
        min_child_weight=0.001,
        min_child_samples=20,
        subsample=1.0,
        subsample_freq=0,
        colsample_bytree=1.0,
        reg_alpha=0.0,
        reg_lambda=0.0,
        random_state=None,
        n_jobs=None,
        importance_type='split',
    )
    # Train the LightGBM model
    lgbm.fit(X_train, y_train)
    # Extract leaf-index features: for each sample, the index of the leaf
    # it lands in for each of the n_estimators trees
    X_train_leaves = lgbm.predict(X_train, pred_leaf=True)
    X_val_leaves = lgbm.predict(X_val, pred_leaf=True)
    X_test_leaves = lgbm.predict(X_test, pred_leaf=True)
    return X_train_leaves, X_val_leaves, X_test_leaves
X_train_leaves, X_val_leaves, X_test_leaves = extract_features_with_lightgbm(X1_train, y1_train, X1_test, X_test)
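As promised above, here is what a LightGBM-only baseline might look like, a minimal sketch and my reconstruction rather than the author's actual submission (the 'e'/'p' mapping relies on the alphabetical encoding checked earlier):
# Hypothetical LightGBM-only baseline: fit on the full training data and
# predict encoded class labels directly, skipping the MLP entirely.
baseline = lgb.LGBMClassifier(num_leaves=31, learning_rate=0.08, n_estimators=130)
baseline.fit(X_train, y_train)
pred = baseline.predict(X_test)  # returns encoded labels: 0 -> 'e', 1 -> 'p'
sub = pd.DataFrame({'id': X_test['id'], 'class': np.where(pred == 1, 'p', 'e')})
sub.to_csv('lgbm_baseline.csv', index=False)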
Adapting for dl
Standardization is here because an MLP comes next, emmm; converting to float32 tensors is the standard move.
# Standardize the leaf-index features
scaler = StandardScaler()
X_train_leaves = scaler.fit_transform(X_train_leaves)
X_val_leaves = scaler.transform(X_val_leaves)
X_test_leaves = scaler.transform(X_test_leaves)
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_leaves, dtype=torch.float32)
y_train_tensor = torch.tensor(y1_train.values, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val_leaves, dtype=torch.float32)
y_val_tensor = torch.tensor(y1_test.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_leaves, dtype=torch.float32)
# Build datasets and data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
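A side note from me: leaf indices are categorical IDs, so StandardScaler treats arbitrary numbering as if it were ordered. The classic GBDT-feature trick (as in Facebook's GBDT+LR) one-hot encodes the leaves instead. A minimal sketch, assuming hypothetical names raw_train_leaves / raw_val_leaves / raw_test_leaves hold the unscaled output of extract_features_with_lightgbm:
# Alternative (not the author's approach): one-hot encode the *raw* leaf
# indices instead of standardizing them, since leaf IDs are categorical.
from sklearn.preprocessing import OneHotEncoder
leaf_encoder = OneHotEncoder(handle_unknown='ignore')  # unseen leaves -> all zeros
train_leaves_1h = leaf_encoder.fit_transform(raw_train_leaves)  # sparse CSR matrix
val_leaves_1h = leaf_encoder.transform(raw_val_leaves)
test_leaves_1h = leaf_encoder.transform(raw_test_leaves)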
dl
Classic DL boilerplate that you can basically copy verbatim. If you used the d2l library, would you even need to write the training loop yourself? Honestly, I still can't write it fluently from scratch.
This is where torch.no_grad() comes in: it's the validation set, so there's no need to update the weights. The same goes for the code in the res section later.
No Dropout or BatchNorm because I'm honestly lazy (jk, really it's that my fundamentals are shaky and I couldn't find the code I'd used before); there's a sketch of adding them after the model class.
Anyway, accuracy is 99%+ either way (the data is just that clean).
# The model
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()  # these layers are honestly pure filler QAQ

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        return x
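Since I mentioned skipping Dropout and BatchNorm above, here is a minimal sketch of what adding them could look like (the 0.2 dropout rate is an untuned assumption of mine):
# Hypothetical variant with BatchNorm + Dropout; rates and sizes are illustrative.
class RegularizedModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, p_drop=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(p_drop),
            nn.Linear(hidden_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(p_drop),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        return self.net(x)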
# Initialize the model (hyperparameters)
input_size = X_train_leaves.shape[1]
hidden_size = 64
output_size = 1
model = Model(input_size, hidden_size, output_size)
# Define the loss function and optimizer
criterion = nn.MSELoss()  # MSE on 0/1 labels; see the note after the training loop
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Train the model
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch_X, batch_y in train_loader:
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Loss on the validation set
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            outputs = model(batch_X)
            val_loss += criterion(outputs, batch_y).item()
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')
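A note from me: MSE on 0/1 labels clearly works here, but nn.BCEWithLogitsLoss is the more standard choice for binary classification, and swapping it in is a one-line change:
# Suggested swap (not the original code): binary cross-entropy on raw logits.
criterion = nn.BCEWithLogitsLoss()
# The training loop stays the same; at prediction time,
# torch.sigmoid(model(X_test_tensor)) gives a proper probability to threshold.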
Who's going to bestow a GPU on me?
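For the record, moving this to a GPU only takes a few lines; a minimal sketch, assuming CUDA is available:
# Hypothetical device handling: move the model and each batch to the GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# inside the training/validation loops:
#     batch_X, batch_y = batch_X.to(device), batch_y.to(device)
# and for prediction:
#     predictions = model(X_test_tensor.to(device)).cpu().numpy()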
res
Just look at sample.csv and match its format. Since this is a classification problem, there's an extra thresholding step, and .numpy() converts the output back, since what comes out of the model is a tensor.
# Predict
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor).numpy()
# Save predictions in the sample.csv format
lala = pd.DataFrame({'id': X_test['id'], 'class': predictions.flatten()})
threshold = 0.5
lala['class'] = ['p' if pred > threshold else 'e' for pred in lala['class']]
lala.to_csv('lala.csv', index=False)
print("Predictions saved to lala.csv")