真人麻将游戏

点击注册
点击注册
.
真人麻将游戏 你的位置:真人麻将游戏 > 棋牌资讯 >

足球预测大小球

发布日期:2022-04-04 14:26    点击次数:175

足球预测大小球

一、场景介绍像“V站”APP,可在即时盘或滚动盘中进行大小球的预测,利用机器学习预测大小球的概率。二、数据说明数据主要有两方面:即时盘或滚动盘的实时赔率数据、整场比赛的离线特征及衍生特征。备注:① 赔率数据的缺失比例跟不同博彩不同场景下的开盘有关,一般赔率数据存储所占内存较大。常用的一般是bet365,皇冠,立博,易胜博,韦德,威廉希尔等。② 整场比赛的训练特征最好做特征衍生,在预测时比赛的整场原始特征是没有的。三、模型说明对实时赔率特征、实时赔率+离线特征分别进行了lstm、bilstm和xgb的模型效果对比。四、代码说明4.1 sql提取数据(1) 实时赔率特征提取-- 主盘口-即时盘-【bet365】-亚盘/欧盘/大小球 select concat('A', '_', c.match_id) as match_id, c.company_id, c.handicap_type, c.odds_1, c.odds_2, c.odds_3 from ( select b.*, row_number() over(partition by b.match_id, b.company_id, b.handicap_type order by b.odds_date desc) as rk from ( select a.match_id, a.company_id, a.handicap_type, -- bet365的欧盘即时盘 split(regexp_replace(regexp_replace(a.odds, '\\}', ''), '\\{', ''),',')[0] as odds_1, split(regexp_replace(regexp_replace(a.odds, '\\}', ''), '\\{', ''),',')[1] as odds_2, split(regexp_replace(regexp_replace(a.odds, '\\}', ''), '\\{', ''),',')[2] as odds_3, a.odds_date from football.ft_t_odds a where a.handicap_num=1 -- 主盘口 and a.tag='即' -- 即时盘 and a.company_id in (2, 3, 5, 9, 10, 11) -- 2: BET365(英国); 3: 皇冠; 5:立博 ; 9:威廉希尔 ; 10:易胜博 ; 11:韦德 and a.handicap_type in (1, 2, 3) -- 1:亚盘; 2:欧盘; 3:大小球 and a.odds is not null and a.odds not like '%封%' ) b ) c where c.rk=1 ; (2) 离线特征及标签提取-- 亚盘盘口汇总 select distinct split_part(split_part(split_part(cast(finally_asia as varchar), '}', 1), '{', 2), ',', 2) AS asia_plate from public.ft_t_match ; -- 每场比赛的标签提取【20170101-20200930】 select c.match_id, case when c.home_score=c.away_score then 2 when c.home_score>c.away_score then 1 when c.home_score<c.away_score then 0 end as european_label, -- 欧盘-胜平负: 0-主队负, 1: 主队胜, 2: 平局 case when (c.home_score+c.away_score)=c.bigsmall_ball_hancidap then 2 when (c.home_score+c.away_score)>c.bigsmall_ball_hancidap then 1 when (c.home_score+c.away_score)<c.bigsmall_ball_hancidap then 0 end as bigsmall_ball_label, -- 大小球: 0-小球, 1: 大球, 2: 平局 case when mod(c.bigsmall_ball_hancidap/0.5, 2)=0 then 0 else 1 end as bigsmall_ball_handicap_label, -- 大小球盘口标志 (是否是.5形式,0否1是) case when c.concede_points>=0 
and (c.home_score-c.concede_points)>c.away_score then 1 when c.concede_points>=0 and (c.home_score-c.concede_points)<c.away_score then 0 when c.concede_points<0 and (c.away_score+c.concede_points)>c.home_score then 1 when c.concede_points<0 and (c.away_score+c.concede_points)<c.home_score then 0 else 2 end as upperlower_plate_label, -- 上下盘:0-下盘,1-上盘,2-走盘 abs(c.concede_points) as concede_points_abs, -- 让球数(取绝对值) case when c.concede_points>=0 then 1 else 0 end as home_concede_label, -- 主队让球标志 case when c.concede_points<0 then 1 else 0 end as away_concede_label -- 客队让球标志 from ( select b.*, cast(split_part(b.finally_goal_handicap, '/', 1) as numeric) as bigsmall_ball_hancidap, case when b.asia_plate='平手/半球' then 0.25 when b.asia_plate='平手' then 0 when b.asia_plate='半球' then 0.5 when b.asia_plate='受让平手/半球' then -0.25 when b.asia_plate='半球/一球' then 0.75 when b.asia_plate='一球' then 1.0 when b.asia_plate='受让半球' then -0.5 when b.asia_plate='一球/球半' then 1.25 when b.asia_plate='受让半球/一球' then -0.75 when b.asia_plate='球半' then 1.5 when b.asia_plate='受让一球' then -1.0 when b.asia_plate='球半/两球' then 1.75 when b.asia_plate='受让一球/球半' then -1.25 when b.asia_plate='两球' then 2.0 when b.asia_plate='受让球半' then -1.5 when b.asia_plate='两球/两球半' then 2.25 when b.asia_plate='受让球半/两球' then -1.75 when b.asia_plate='受让两球' then -2.0 when b.asia_plate='两球半' then 2.5 when b.asia_plate='受让两球/两球半' then -2.25 when b.asia_plate='两球半/三球' then 2.75 when b.asia_plate='受让两球半' then -2.5 when b.asia_plate='三球' then 3.0 when b.asia_plate='受让两球半/三球' then -2.75 when b.asia_plate='受让三球' then -3.0 when b.asia_plate='三球/三球半' then 3.25 when b.asia_plate='三球半' then 3.5 when b.asia_plate='三球半/四球' then 3.75 when b.asia_plate='受让三球/三球半' then -3.25 when b.asia_plate='受让三球半' then -3.5 when b.asia_plate='四球' then 4.0 when b.asia_plate='受让三球半/四球' then -3.75 when b.asia_plate='受让四球' then -4.0 when b.asia_plate='四球半' then 4.5 when b.asia_plate='四球/四球半' then 4.25 when b.asia_plate='受让四球半' then -4.5 when b.asia_plate='四球半/五球' 
then 4.75 when b.asia_plate='受让四球/四球半' then -4.25 when b.asia_plate='五球' then 5.0 when b.asia_plate='受让四球半/五球' then -4.75 when b.asia_plate='受让五球' then -5.0 when b.asia_plate='五球/五球半' then 5.25 when b.asia_plate='受让五球半' then -5.5 when b.asia_plate='五球半' then 5.5 when b.asia_plate='五球半/六球' then 5.75 when b.asia_plate='受让五球/五球半' then -5.25 when b.asia_plate='受让五球半/六球' then -5.75 when b.asia_plate='六球' then 6.0 when b.asia_plate='受让六球' then -6.0 when b.asia_plate='受让六球半' then -6.5 when b.asia_plate='受让六球/六球半' then -6.25 when b.asia_plate='六球半' then 6.5 when b.asia_plate='六球/六球半' then 6.25 when b.asia_plate='受让七球' then -7.0 when b.asia_plate='受让七球/七球半' then -7.25 when b.asia_plate='七球' then 7.0 when b.asia_plate='六球半/七球' then 6.75 when b.asia_plate='七球半' then 7.5 when b.asia_plate='七球/七球半' then 7.25 when b.asia_plate='受让六球半/七球' then -6.75 when b.asia_plate='受让七球半' then -7.5 when b.asia_plate='受让七球半/八球' then -7.75 when b.asia_plate='八球半' then 8.5 when b.asia_plate='七球半/八球' then 7.75 when b.asia_plate='八球半/九球' then 8.75 when b.asia_plate='受让八球' then -8.0 when b.asia_plate='九球半/十球' then 9.75 when b.asia_plate='九球/九球半' then 9.25 when b.asia_plate='九球' then 9.0 when b.asia_plate='受让九球' then -9.0 when b.asia_plate='受让九球/九球半' then -9.25 when b.asia_plate='八球/八球半' then 8.25 when b.asia_plate='八球' then 8.0 when b.asia_plate='-13' then -13.0 when b.asia_plate='10.75' then 10.75 when b.asia_plate='受让八球半' then -8.5 when b.asia_plate='十球' then 10 when b.asia_plate='11.75' then 11.75 when b.asia_plate='九球半' then 9.5 when b.asia_plate='受让九球半' then -9.5 when b.asia_plate='受让八球半/九球' then -8.75 when b.asia_plate='-11.5' then -11.5 when b.asia_plate='受让九球半/十球' then -9.75 when b.asia_plate='10.5' then 10.5 when b.asia_plate='10.5' then 10.5 when b.asia_plate='受让八球/八球半' then -8.25 when b.asia_plate='受让十球' then -10 when b.asia_plate='11.5' then 11.5 when b.asia_plate='18.5' then 18.5 when b.asia_plate='15.5' then 15.5 when b.asia_plate='-10.5' then -10.5 when b.asia_plate='12.5' then 12.5 
when b.asia_plate='14.5' then 14.5 else null end as concede_points from ( select concat('A_', a.id) as match_id, a.home_score, a.away_score, a.home_corner, a.away_corner, split_part(split_part(split_part(cast(a.finally_asia as varchar), '}', 1), '{', 2), ',', 2) AS asia_plate, -- 最终亚盘盘口, split_part(split_part(split_part(cast(a.finally_goal as varchar), '}', 1), '{', 2), ',', 2) as finally_goal_handicap -- 最终大小球盘口 from ft_t_match a where cast(a.match_time as date) BETWEEN '2017-01-01' AND '2020-09-30' and a.has_score_line='1' and a.home_score is not null and a.away_score is not null and a.finally_asia is not null and a.finally_goal is not null and cast(a.finally_asia as varchar) not like '%封%' and cast(a.finally_goal as varchar) not like '%封%' ) b ) c ; -- 离线特征提取【20170101-20200930】 select concat('A', '_', a.id) as match_id, case when a.away_half_possession is null then null else cast(split_part(a.away_half_possession, '%', 1) as float)/100 end AS away_half_possession, -- 客队半场控球率 , case when a.home_half_possession is null then null else cast(split_part(a.home_half_possession, '%', 1) as float)/100 end AS home_half_possession, -- 主队半场控球率 , case when a.away_possession is null then null else cast(split_part(a.away_possession, '%', 1) as float)/100 end AS away_possession, -- 客队全场控球率 , case when a.home_possession is null then null else cast(split_part(a.home_possession, '%', 1) as float)/100 end AS home_possession, -- 主队全场控球率 , a.home_attack, a.away_attack, a.home_dangerous_attack, a.away_dangerous_attack, case when a.home_attack is null or a.home_attack=0 then null else round(a.home_dangerous_attack/a.home_attack, 2) end AS home_danger_rate, -- 主队危险进攻率, case when a.away_attack is null or a.away_attack=0 then null else round(a.away_dangerous_attack/a.away_attack, 2) end AS away_danger_rate, -- 客队危险进攻率, a.home_red, a.away_red, a.home_yellow, a.away_yellow, a.home_shoot, a.away_shoot, a.away_shoot_on, a.home_shoot_on, a.home_shoot_out, a.away_shoot_out, case when 
a.home_shoot is null or a.home_shoot=0 then null else round(a.home_shoot_on/a.home_shoot, 2) end AS home_shoot_rate, -- 主队射正率, case when a.away_shoot is null or a.away_shoot=0 then null else round(a.away_shoot_on/a.away_shoot, 2) end AS away_shoot_rate, -- 客队射正率, a.away_corner, a.home_corner, a.away_half_corner, a.home_half_corner, a.home_shoot_stop, a.away_shoot_stop, a.home_pass, a.away_pass, a.home_pass_success, a.away_pass_success, a.home_foul, a.away_foul, a.home_head, a.away_head, a.home_head_success, a.away_head_success, a.home_follow, a.away_follow, a.home_slide, a.away_slide, a.home_past, a.away_past, a.home_out, a.away_out, a.home_on_door, a.away_on_door, a.home_steals, a.away_steals, a.home_assists, a.away_assists, a.home_first, a.away_first, a.home_intercept, a.away_intercept from ft_t_match a where cast(a.match_time as date) BETWEEN '2017-01-01' AND '2020-09-30' and a.has_score_line='1' and a.home_score is not null and a.away_score is not null and a.finally_asia is not null and a.finally_goal is not null and cast(a.finally_asia as varchar) not like '%封%' and cast(a.finally_goal as varchar) not like '%封%' ;4.2 数据整合将sql提取的数据整合成三个文件:比赛标签(match_label_data.csv)、实时赔率数据(odds_features.csv)、离线特征(offline_features.csv),数据之间根据match_id进行关联。4.3 模型训练 因为最终模型选择了xgb,只做xgb的模型训练展示。#!/usr/bin/env python # coding: utf-8 import os import numpy as np import pandas as pd import warnings warnings.filterwarnings("ignore"), color_codes=True) # 设置绘图风格 mpl.rcParams['font.family'] = ['sans-serif'] mpl.rcParams['font.sans-serif'] = ['SimHei'] # 正常显示中文标签 mpl.rcParams['axes.unicode_minus'] = False plt.rcParams['figure.figsize'] = (15.0, 10.0) # 设置图形大小 plt.rcParams['savefig.dpi'] = 200 # 图片像素 plt.rcParams['figure.dpi'] = 200 # 分辨率 # # 数据读取 # ## 比赛标签数据 match_label_data = pd.read_csv("./data/final_dataset/match_label_data.csv",) print(match_label_data.shape) print(match_label_data.head()) print(match_label_data.dtypes) # ## 比赛离线特征 # 不含标签列 match_offline_features = 
pd.read_csv("./data/final_dataset/offline_features.csv",) print(match_offline_features.shape) print(match_offline_features.head()) print(match_offline_features.dtypes) # ## 比赛最终滚盘赔率 [皇冠] hg_instant_feature = pd.read_csv("./data/final_dataset/hg_instant_feature_data.csv",) print(hg_instant_feature.shape) print(hg_instant_feature.head()) # ## 实时赔率特征 odds_features = pd.read_csv("./data/final_dataset/odds_features.csv",) print(odds_features.shape) print(odds_features.head()) print(odds_features.dtypes) # bet365赔率特征提取 feature_idx = odds_features.columns.str.contains("^bet365.+", regex=True) # print(feature_idx) bet365_features = ["match_id"],),) match_all_data,) print(match_all_data.shape) print(match_all_data.dtypes) # 标签分布统计 match_all_data.groupby(["bigsmall_ball_label"])["match_id"].count() # 剔除平局 match_binary_data = match_all_data[match_all_data["bigsmall_ball_label"] != 2] match_binary_data.drop(["european_label", "upperlower_asia_label"], axis=1, inplace=True) match_binary_data.groupby(["bigsmall_ball_label"])["match_id"].count() # # 数据探索性分析 # ## 缺失率统计 # 数值型变量统计 match_binary_numeric_stat = match_binary_data.select_dtypes(include=["float64", "int64"]).describe().T, ascending=False) print(match_binary_numeric_stat.head()) # match_binary_numeric_stat.to_csv("./feature_statistic/match_binary_numeric_missing_stat.csv",) # 结果保存 missing_pct_threshold = 0.3 # 缺失阈值 # 缺失率小于0.3的数值型特征进行统计 numeric_list = match_binary_numeric_stat[match_binary_numeric_stat["missing_pct"] < missing_pct_threshold].index.tolist() print("因子型特征保留缺失率低于{0}的特征有{1}个。".format(missing_pct_threshold, len(numeric_list))) print(numeric_list) # 赛前预测(利用bet365特征) bet365_features.remove("match_id") numeric_list): """ df: 数据集 feature_list: 特征列表 target: 统计目标 feature_type: 特征类型。默认为numeric. 
若为因子型特征,取object:按照占比降序排列; 若为数字型特征,取numeric(float64 & int64):按照特征值升序排列。 """ # 创建一个空的数据框 return_stat = pd.DataFrame(columns = ["value", "count", "pct", "feature"]) for col in feature_list: if col == target: continue else: N = len(df) col_stat = pd.DataFrame(df.groupby(col)[target].count()) col_stat.reset_index(level=0, inplace=True) col_stat.rename(columns={col: "value", target: "count"}, inplace=True) n_value = len(col_stat) col_stat["pct"] = col_stat.apply(lambda x: x[1] / N, axis=1) col_stat["feature"] = [col for i in range(n_value)] # print(col_stat) if feature_type == "object": , axis=0, ascending=False) # col_stat = col_stat.sort(columns=["pct"], axis=0, ascending=False) else: col_stat = col_stat.sort_values("value",,)) # numeric_feature_stat.to_csv("./feature_statistic/offline_numeric_feature_pct_stat.csv",, index=False) # ### 特征分布可视化 # 连续特征:直方图、分组直方图;核密度图、小提琴图(分位数) 、分组箱线图 def continuous_feature_plot(df, hist_feature_list, n_bins=50, fontsize=14, target=None): """ 连续特征的直方图和核密度图。若target不为空,同时展示分组直方图和分组箱线图. hist_feature_list: 连续特征列表. n_bins: 直方图分多少箱, 默认50箱. fontsize: 字体大小,默认为14. target: 目标变量,当前固定为2个(0: 好用户,1:坏用户). 
""" for col in hist_feature_list: print("连续特征:",) # 直方图 plt.subplot(221) plt.tight_layout() sbn.distplot(df[col]) plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize}) plt.title("{col}--直方图".format(col=col),,,) plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize}) plt.title("{col}--小提琴图".format(col=col), fontdict={'weight':'normal', 'size': fontsize}) print("进行分组可视化......") unique_vals = df[target].unique().tolist() unique_val0 = df[df[target] == unique_vals[0]] unique_val1 = df[df[target] == unique_vals[1]] # unique_val2 = df[df[target] == unique_vals[2]] # 分组直方图 plt.subplot(223) plt.tight_layout() sbn.distplot(unique_val0[col], bins=n_bins, kde=False, norm_hist=True, color='steelblue', label=str(unique_vals[0])) sbn.distplot(unique_val1[col], bins=n_bins, kde=False, norm_hist=True, color='purple', label=str(unique_vals[1])) # sns.distplot(unique_val2[col], bins=n_bins, kde=False, norm_hist=True, color='pink', label=str(unique_vals[2])) plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize}) plt.legend() plt.title("{col}--分组直方图".format(col=col), fontdict={'weight':'normal', 'size': fontsize}) # 分组核密度图 plt.subplot(224) plt.tight_layout() sbn.distplot(unique_val0[col], hist=False, kde_kws={"color":"red", "linestyle":"-"}, norm_hist=True, label=str(unique_vals[0])) sbn.distplot(unique_val1[col], hist=False, kde_kws={"color":"black", "linestyle":"--"}, norm_hist=True, label=str(unique_vals[1])) # sns.distplot(unique_val2[col], hist=False, kde_kws={"color":"green", "linestyle":"-."}, norm_hist=True, label=str(unique_vals[2])) plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize}) plt.legend() plt.title("{col}--分组核密度图".format(col=col), fontdict={'weight':'normal', 'size': fontsize}) """ 分组箱线图 """ # plt.subplot(222) # plt.tight_layout() # sns.boxplot(x=[unique_val0[col], unique_val1[col]], labels=[unique_vals[0], unique_vals[1]]) # plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize}) # 
plt.title("{col}特征的分组箱线图".format(col=col),) # 直方图 plt.subplot(121) plt.tight_layout() sbn.distplot(df[col]) plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize}) plt.title("{col}--直方图".format(col=col),,,) plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize}) plt.title("{col}--小提琴图".format(col=col), fontdict={'weight':'normal', 'size': fontsize}) plt.savefig("{col}--直方图&箱线图.png".format(col=col)) 为目标变量进行可视化 def continuous_visualize(df, hist_feature_list, n_bins=50, fontsize=14, target=None, exclude_cols=None): for col in hist_feature_list: if (col != target) & (col not in exclude_cols): mid_data = df[df[col] != -999] continuous_feature_plot(mid_data, [col], n_bins=n_bins, fontsize=fontsize, target=target) exclude_cols=["away_danger_rate", "home_danger_rate", "away_shoot_rate", "home_shoot_rate", "away_red", "home_red"], exclude_cols=exclude_cols) # 漏网之鱼 # continuous_visualize(football_copy, ["away_danger_rate"],) # ## 二分类 # ### 数据提取 bigsmall_ball_binary_xgb = match_binary_data[numeric_list] print(bigsmall_ball_binary_xgb.shape) bigsmall_ball_binary_xgb.dtypes # ### 划分数据集 # binary classificationl for xgboost model training # bigsmall_ball_binary_xgb.drop("match_id", axis=1, inplace=True) X0 = bigsmall_ball_binary_xgb.drop("bigsmall_ball_label", axis=1) y0 = bigsmall_ball_binary_xgb["bigsmall_ball_label"] # 划分训练集、测试集和验证集 X, X_verify, y, y_verify = train_test_split(X0, y0, train_size=0.95, random_state=1234) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=1234) print("The size of train set:", X_train.shape) print("The size of test set:", X_test.shape) print("The size of verify set:",, useTrainCV=True, cv_folds=5, early_stopping_rounds=50): """ Define a function for modeling and cross-validation. 
This function will do the following: ① fit the model ② determine training accuracy ③ determine training AUC ④ determine testing AUC ⑤ update n_estimators with cv function of xgboost package ⑥ plot Feature Importance ⑦ Record the values of each feature under different measures of feature importance :param alg: model :param X_train: train set :param y_train: train label :param X_test: test set :param y_test: test label :param useTrainCV: Whether choose cross validation :param cv_folds: The number of CV folds :param early_stopping_rounds: The rounds of early stopping :return: Feature importance is stored in the current folder """ if useTrainCV: xgb_param = alg.get_xgb_params() dtrain = xgb.DMatrix(X_train, label=y_train) cvResult = xgb.cv(xgb_param, dtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, # metrics='merror', # mlogloss metrics=evaluate_metric, early_stopping_rounds=early_stopping_rounds) print(cvResult.tail(10)) alg.set_params(n_estimators=cvResult.shape[0]) # fit model # alg.fit(X_train, y_train, eval_metric='merror') # mlogloss alg.fit(X_train, y_train, eval_metric=evaluate_metric) # Predict training set dtrainPred = alg.predict(X_train) dtrainPredprob = alg.predict_proba(X_train)[:, 1] # Print model report print("\nModel Report") print("Accuracy : {}".format(metrics.accuracy_score(y_train, dtrainPred))) print("AUC Score (Train): {}".format(metrics.roc_auc_score(y_train, dtrainPredprob))) # Predict on testing data: testPredprob = alg.predict_proba(X_test)[:,1] print("AUC Score (Test): {}".format(metrics.roc_auc_score(y_test, testPredprob))) featImportStat = pd.DataFrame(X_train.columns, columns=["feature"]) # 特征重要性 for importType in ['weight', 'total_gain', 'total_cover', 'gain', 'cover']: print("importType: {0}".format(importType)) plot_importance(alg, importance_type=importType) plt.savefig(importType + ".png") , columns=[importType]) importResult.reset_index(col_level=0, inplace=True) importResult.rename(columns = {"index": 
"feature"}, inplace=True) featImportStat = featImportStat.merge(importResult, on=["feature"],) featImportStat.to_csv("./feature_importance/feature_importance_binary_xgb.csv",,, ), nthread=4, # 运行的线程数 learning_rate=0.1, # 学习率,默认0.3 gamma=0, # 节点分裂所需的最小损失函数下降值,一般在0.01-0.2之间 max_depth=6, # 每棵树的最大深度,默认为6 min_child_weight=1, # 最小叶节点的样本权重和 max_delta_step=0, # 限制每棵树权重改变的最大步长,默认为0 n_estimators=200, # 迭代轮次 subsample=0.8, # 控制对每棵树随机采样的比例 colsample_bytree=0.8, # 列采样 # reg_lambda=1, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。 silent=0, # 输出运行信息 objective='binary:logistic', # 二分类 # num_class=3, # 类别数 seed=1234), param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5) gridsearch1.fit(X_train, y_train) print(gridsearch1.cv_results_["mean_test_score"]), nthread=4, # 运行的线程数 learning_rate=0.1, # 学习率,默认0.3 gamma=0, # 节点分裂所需的最小损失函数下降值,一般在0.01-0.2之间 max_depth=4, # 每棵树的最大深度,默认为6 min_child_weight=7, # 最小叶节点的样本权重和 max_delta_step=0, # 限制每棵树权重改变的最大步长,默认为0 n_estimators=200, # 迭代轮次 subsample=0.8, # 控制对每棵树随机采样的比例 colsample_bytree=0.8, # 列采样 # reg_lambda=1, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。 silent=0, # 输出运行信息 objective='binary:logistic', # 二分类 # num_class=3, # 类别数 seed=1234), param_grid=param_test2, scoring='roc_auc', n_jobs=4, iid=False, cv=5) gridsearch2.fit(X_train, y_train) print(gridsearch2.cv_results_["mean_test_score"]), nthread=4, # 运行的线程数 learning_rate=0.1, # 学习率,默认0.3 gamma=0, # 节点分裂所需的最小损失函数下降值,一般在0.01-0.2之间 max_depth=4, # 每棵树的最大深度,默认为6 min_child_weight=7, # 最小叶节点的样本权重和 max_delta_step=0, # 限制每棵树权重改变的最大步长,默认为0 n_estimators=200, # 迭代轮次 subsample=0.8, # 控制对每棵树随机采样的比例 colsample_bytree=0.8, # 列采样 # reg_lambda=1, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。 silent=0, # 输出运行信息 objective='binary:logistic', # 二分类 # num_class=3, # 类别数 seed=1234), param_grid=param_test3, scoring='roc_auc', n_jobs=4, iid=False, cv=5) gridsearch3.fit(X_train, y_train) print(gridsearch3.cv_results_["mean_test_score"]) # 不同参数下交叉验证结果 print(gridsearch3.best_params_) print(gridsearch3.best_score_) # #### 
subsample和colsample_bytree调优 param_test4 = { 'subsample': [i/10.0 for i in range(6, 10)], 'colsample_bytree': [i/10.0 for i in range(6, 10)] } gridsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=200, max_depth=6, min_child_weight=1, gamma=0.2, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27), param_grid=param_test4, scoring='roc_auc', n_jobs=4, iid=False, cv=5) gridsearch4.fit(X_train, y_train) print(gridsearch4.cv_results_["mean_test_score"]) # 不同参数下交叉验证结果 print(gridsearch4.best_params_) print(gridsearch4.best_score_) param_test41 = { 'subsample': [i/10.0 for i in range(6, 10)], 'colsample_bytree': [i/10.0 for i in range(6, 10)] } gridsearch41 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=205, max_depth=6, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27), param_grid=param_test41, scoring='roc_auc', n_jobs=4, iid=False, cv=5) gridsearch41.fit(X_train, y_train) print(gridsearch41.cv_results_["mean_test_score"]) # 不同参数下交叉验证结果 print(gridsearch41.best_params_) print(gridsearch41.best_score_) # #### 正则化参数调优 param_test5 = { 'reg_alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.05, 0.1, 0.5, 1] } gridsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=200, max_depth=6, min_child_weight=1, gamma=0.2, reg_alpha=0, subsample=0.7, colsample_bytree=0.9, objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27), param_grid=param_test5, scoring='roc_auc', n_jobs=4, iid=False, cv=5) gridsearch5.fit(X_train, y_train) print(gridsearch5.cv_results_["mean_test_score"]) # 不同参数下交叉验证结果 print(gridsearch5.best_params_) print(gridsearch5.best_score_) # #### 降低学习率 param_test6 = { 'learning_rate': [x/100 for x in range(1, 10)] } gridsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=200, max_depth=6, min_child_weight=1, gamma=0.2, 
subsample=0.7,

腾讯棋牌类游戏都有这种机制:你充钱它就会记录下来,在充值后的一段时间内它会先让你赢,过一段时间后你会发现赢了多少就会输多少。我玩这个游戏先充了一点钱,慢慢地赢了两千万吧,然后去富商场玩(还不是尊爵场)真钱麻将游戏,结果对家胡了一个三根杠开花的海底捞月的清金钩钓,瞬间输光。当你输光后就会心里痒痒,想要再次充钱,结果发现只是又按照流程走了一遍。 已赞过 已踩过 收起 热心网友 2018-10-07