I took part in a big-data modeling competition organized by my workplace; writing it down here as a set of notes.

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from itertools import product as prod

Utility functions

# Data-cleaning function
def data_filter(data):
    """
    Handle missing values, select features, etc.
    Input:
        data: pd.DataFrame, the raw data
    Return:
        filtered_data: pd.DataFrame, the cleaned data
    """
    # Assuming the input is a DataFrame, drop samples that contain missing values
    filtered_data = data
    # filtered_data["sample"] = filtered_data["sample"].replace(np.nan, filtered_data["sample"].mean())  # replace NaN with the column mean
    filtered_data = filtered_data.dropna()  # drop rows with any missing value
    return filtered_data
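
An alternative to dropping rows is imputation, along the lines of the commented-out mean-replacement above. A minimal sketch (a hypothetical helper, not used below; fills every numeric NaN with its column mean):

# Sketch: fill numeric NaNs with column means instead of dropping rows
def data_filter_impute(data):
    filled = data.copy()
    return filled.fillna(filled.mean(numeric_only=True))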

# Map string fields to numeric enums
def str_enum(data):
    filtered_data = data
    # Map the brand field to enum codes, in order of first appearance
    brands = filtered_data["brnd_nam"].drop_duplicates().reset_index()
    brand_dict = {}
    for i in brands.index:
        brand_dict[brands["brnd_nam"][i]] = i
    filtered_data["brnd_nam"] = filtered_data["brnd_nam"].map(brand_dict)
    # Gender: "资料不详" (unknown) -> 0, "男" (male) -> 1, "女" (female) -> 2
    filtered_data["gdr_typ_nam"] = filtered_data["gdr_typ_nam"].map({"资料不详": 0, "男": 1, "女": 2})
    # "否" (no) -> 0, "是" (yes) -> 1
    filtered_data["dou_ca_usr"] = filtered_data["dou_ca_usr"].map({"否": 0, "是": 1})

    return filtered_data
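
The brand loop can also be collapsed with pandas' built-in pd.factorize, which likewise assigns codes in order of first appearance; inside str_enum the mapping lines would become:

# Equivalent one-liner for the brand enum (codes follow order of first appearance)
# filtered_data["brnd_nam"] = pd.factorize(filtered_data["brnd_nam"])[0]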

# Build training/test splits with K-fold cross-validation
def K_fold_train_test(X, y, random_seed=0):
    """
    Use K-fold cross-validation to build multiple training and test sets.
    Input:
        X: np.array with shape (sample_num, feature_dim), feature values of all samples
        y: np.array with shape (sample_num,), labels of all samples
        random_seed: int, seed controlling whether repeated runs produce the same shuffle
    Return:
        train_data: [tuple(X, y)], a list of tuples, one per training split
        test_data: [tuple(X, y)], a list of tuples, one per test split
    """
    train_data = []
    test_data = []
    
    # see also sklearn.model_selection.RepeatedKFold for repeated K-fold splits
    kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
    for train_index, test_index in kf.split(X):
        train_tup = (X[train_index], y[train_index])
        test_tup = (X[test_index], y[test_index])
        train_data.append(train_tup)
        test_data.append(test_tup)
    
    return train_data, test_data
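
A quick usage sketch with toy arrays (shapes purely illustrative):

# Toy example: 10 samples, 2 features, binary labels
X_demo = np.arange(20).reshape(10, 2).astype(float)
y_demo = np.array([0, 1] * 5)
train_folds, test_folds = K_fold_train_test(X_demo, y_demo, random_seed=0)
print(len(train_folds))            # 5 folds
fold_X, fold_y = train_folds[0]    # features/labels of the first training split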


"""
Parameter-grid helper module.
Function to convert dictionary of lists to list of dictionaries of all combinations of listed variables. 
Example:
    list_of_param_dicts({'a': [1, 2], 'b': [3, 4]}) ---> [{'a': 1, 'b': 3}, {'a': 1, 'b': 4}, {'a': 2, 'b': 3}, {'a': 2, 'b': 4}]
"""
def list_of_param_dicts(param_dict):
    """
    Arguments:
        param_dict   -(dict) dictionary of parameters
    """
    keys = list(param_dict.keys())
    return [dict(zip(keys, combo)) for combo in prod(*param_dict.values())]
# Training data
df_train = pd.read_csv("train_set.csv")
# Test data
df_test = pd.read_csv("test_set.csv")

# Drop unneeded columns from the training data
train_d = df_train.drop(labels=["usr_id", "month", "cust_typ_cd"], axis=1)
train_d = str_enum(train_d)
train_d = data_filter(train_d)
train_d  # inspect the cleaned frame (notebook-style display)

# Drop unneeded columns from the test data
# (note: str_enum rebuilds its brand mapping per call, so the codes here may not
# match the training encoding; sharing one mapping across both sets would be safer)
test_d = df_test.drop(labels=["month", "cust_typ_cd"], axis=1)
test_d = str_enum(test_d)
test_d = data_filter(test_d)
test_d  # inspect the cleaned frame (notebook-style display)

Tuning the model with a 70/30 train/test split:

# ================ XGBoost =====================
import xgboost as xgb
import random                              

train_data = train_d.values
train_features = train_data[:, 1:]                       # training features (label assumed in column 0)
train_labels = train_data[:, 0]                          # training labels

data_length = len(train_data)
random.seed(2)                                            # fix the seed; a different seed gives a different shuffle
sample_idx = list(range(len(train_data)))
random.shuffle(sample_idx)
split_idx = int(0.7*data_length)
train_X = train_features[sample_idx[:split_idx]]
train_y = train_labels[sample_idx[:split_idx]]
test_X = train_features[sample_idx[split_idx:]]
test_y = train_labels[sample_idx[split_idx:]]
train_y = train_y.astype('int')
test_y = test_y.astype('int')
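
# The manual shuffle-and-slice above is equivalent in effect to sklearn's
# train_test_split (same 70/30 ratio, though not the identical shuffle):
# from sklearn.model_selection import train_test_split
# train_X, test_X, train_y, test_y = train_test_split(
#     train_features, train_labels, test_size=0.3, random_state=2)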

# Standardize the features (zero mean, unit variance)
std = StandardScaler()
train_X = std.fit_transform(train_X)
test_X = std.transform(test_X)

dtrain = xgb.DMatrix(train_X, train_y)
dtest = xgb.DMatrix(test_X)

# Build the model with the chosen parameters
param = {'max_depth':20, 'eta':0.3, 'objective':'binary:logistic', 'eval_metric':'logloss'}
xgboost_model = xgb.train(param, dtrain, num_boost_round=20)

predicted_y = xgboost_model.predict(dtest)
# Threshold the predicted probabilities at 0.5
predicted_y = (predicted_y > 0.5).astype(int)

print('=========================================================================================')
print('Parameters:')
print(param)
print('Accuracy:', accuracy_score(test_y, predicted_y))
print('Precision:', precision_score(test_y, predicted_y))
print('Recall:', recall_score(test_y, predicted_y))
print('F1:', f1_score(test_y, predicted_y))
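
num_boost_round is fixed at 20 above; xgb.train can also pick the round count itself via a watchlist and early stopping. A sketch reusing the split above (the eval DMatrix needs labels attached, unlike the dtest above):

# Early stopping on the held-out 30% (sketch; the eval set must carry labels)
deval = xgb.DMatrix(test_X, test_y)
model_es = xgb.train(param, dtrain, num_boost_round=200,
                     evals=[(dtrain, 'train'), (deval, 'eval')],
                     early_stopping_rounds=10)
print('best iteration:', model_es.best_iteration)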

Alternatively, use K-fold cross-validation (scores better, but takes longer since it trains K times):

# ================================= XGBoost =============================
import xgboost as xgb

# Candidate parameter grids
"""
param_dict = dict(
    max_depth = [6],
    eta = [0.3],
    subsample = [0.8],
    colsample_bytree = [0.8],
    objective = ['binary:logistic'],
    eval_metric = ['error', 'logloss', 'map', 'auc'],
    seed = [0],
    gamma = [0.1]
)
"""
param_dict = dict(
    max_depth = [20],
    eta = [0.5],
    subsample = [1],
    colsample_bytree = [1],
    objective = ['binary:logistic'],
    eval_metric = ['error']
)
# Expand the grid into a list of parameter combinations
param_list = list_of_param_dicts(param_dict)
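
# The loop below consumes the fold lists produced by K_fold_train_test; build
# them here (assuming, as in the 70/30 section, the label sits in column 0 of train_d)
all_values = train_d.values
train_data, test_data = K_fold_train_test(all_values[:, 1:], all_values[:, 0])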

# Run every parameter combination once and compare the scores
best_f1 = 0.0
best_param = None
for param in param_list:
    accs = []
    pres = []
    recalls = []
    f1s = []
    for fold_idx in range(len(train_data)):
        train_X, train_y = train_data[fold_idx]
        test_X, test_y = test_data[fold_idx]
        train_y = train_y.astype('int')
        test_y = test_y.astype('int')
        
        # Standardize the features
        std = StandardScaler()
        train_X = std.fit_transform(train_X)
        test_X = std.transform(test_X)
        
        dtrain = xgb.DMatrix(train_X, train_y)
        dtest = xgb.DMatrix(test_X, test_y)
        
        xgboost_model = xgb.train(param, dtrain, num_boost_round=20)
        predicted_y = xgboost_model.predict(dtest)
        
        # Threshold the predicted probabilities at 0.5
        predicted_y = (predicted_y > 0.5).astype(int)
        accs.append(accuracy_score(test_y, predicted_y))
        pres.append(precision_score(test_y, predicted_y))
        recalls.append(recall_score(test_y, predicted_y))
        f1s.append(f1_score(test_y, predicted_y))
    print('=========================================================================================')
    print('Parameters:')
    print(param)
    print('Accuracy:', np.mean(accs))
    print('Precision:', np.mean(pres))
    print('Recall:', np.mean(recalls))
    print('F1:', np.mean(f1s))
        
    if np.mean(f1s) > best_f1:
        best_f1 = np.mean(f1s)
        best_param = param

print('=========================================================================================')
print('best F1 score:', best_f1)
print('best parameter: ', best_param)
# With the best parameters found, retrain on the full training set
import xgboost as xgb

train_data = train_d.values
train_features = train_data[:, 1:]                       # training features
train_labels = train_data[:, 0]                          # training labels

test_data = test_d.values
test_features = test_data[:, 1:]                         # features of the unlabeled test set

# Standardize the features
std = StandardScaler()
train_features = std.fit_transform(train_features)
train_labels = train_labels.astype('int')
test_features = std.transform(test_features)

dtrain = xgb.DMatrix(train_features, train_labels)
dtest = xgb.DMatrix(test_features)
# Build the model with the best parameters
param = {'max_depth': 20, 'eta': 0.3, 'subsample': 1, 'colsample_bytree': 1, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}
xgboost_model = xgb.train(param, dtrain, num_boost_round=20)

# Predict on the unlabeled test-set features
predicted_y = xgboost_model.predict(dtest)
# Threshold the predicted probabilities at 0.5
predicted_y = (predicted_y > 0.5).astype(int)

# Assemble the predictions into a DataFrame and save the result
df_y = pd.DataFrame(predicted_y.tolist(), columns=['pre_result'])
df_u = test_d['usr_id'].reset_index(drop=True)  # realign: dropna() may have left index gaps
result = pd.concat([df_u, df_y], axis=1)
result.to_csv("result.csv", index=False)