6 回归集成:xgb、lgb、cat
这段代码拷贝自 Kaggle,从中可以学习以下几点:
(图片来源网络,侵删)
- 如何模块化地训练三个树模型(XGBoost、LightGBM、CatBoost);
- 文本特征如何做,如何挖掘;
- 时间特征的处理;
- 模型权重集成;
import pandas as pd
import math
import numpy as np
import joblib
import optuna
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime
import gc
from sklearn.base import clone
import warnings

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
d_s = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/solution_example.csv')
te_d = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test.csv')
tr_d = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train.csv')

tr_d.drop('id', axis=1, inplace=True)
te_d.drop('id', axis=1, inplace=True)

# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form does not update the parent
# frame under pandas Copy-on-Write, and warnings are silenced above.
tr_d['holiday_name'] = tr_d['holiday_name'].fillna('None')
te_d['holiday_name'] = te_d['holiday_name'].fillna('None')


# ---------------------------------------------------------------------------
# Date features
# ---------------------------------------------------------------------------
def Process_Date(Df, group_period=None):
    """Add calendar-derived feature columns to ``Df`` in place and return it.

    Reads the ``date``, ``holiday`` and ``shops_closed`` columns. Columns are
    added in a fixed order, which the train column selection further down
    relies on.

    Args:
        Df: DataFrame with ``date``, ``holiday`` and ``shops_closed`` columns.
        group_period: Divisor used for the cyclic ``group_sin``/``group_cos``
            encoding. Defaults to ``Df['group'].max()`` (the original
            behavior). Pass the value computed on the training frame when
            processing the test frame so both share one encoding.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    Df['date'] = pd.to_datetime(Df['date'])
    when = Df['date'].dt

    Df['year'] = when.year
    Df['day'] = when.day
    Df['month'] = when.month
    Df['month_name'] = when.month_name()
    Df['day_of_week'] = when.day_name()
    Df['week'] = when.isocalendar().week

    # NOTE(review): `year` is an integer, so sin(2*pi*year) is ~0 and
    # cos(2*pi*year) is ~1 on every row; these two columns carry no signal.
    # Left unchanged to keep the feature set identical.
    Df['year_sin'] = np.sin(2 * np.pi * Df['year'])
    Df['year_cos'] = np.cos(2 * np.pi * Df['year'])
    Df['month_sin'] = np.sin(2 * np.pi * Df['month'] / 12)
    Df['month_cos'] = np.cos(2 * np.pi * Df['month'] / 12)
    Df['day_sin'] = np.sin(2 * np.pi * Df['day'] / 31)
    Df['day_cos'] = np.cos(2 * np.pi * Df['day'] / 31)

    # Coarse time bucket: 48 per year, 4 per month, plus day // 7. It is also
    # used as the grouping key for GroupKFold in cross_validate.
    Df['group'] = (Df['year'] - 2020) * 48 + Df['month'] * 4 + Df['day'] // 7

    Df['total_holidays_month'] = Df.groupby(['year', 'month'])['holiday'].transform('sum')
    Df['total_shops_closed_week'] = Df.groupby(['year', 'week'])['shops_closed'].transform('sum')

    if group_period is None:
        group_period = Df['group'].max()
    Df['group_sin'] = np.sin(2 * np.pi * Df['group'] / group_period)
    Df['group_cos'] = np.cos(2 * np.pi * Df['group'] / group_period)
    return Df


tr_d = Process_Date(tr_d)
# Reuse the train period so a given `group` value maps to the same sin/cos
# values in both frames.
te_d = Process_Date(te_d, group_period=tr_d['group'].max())

# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the original.
tr_d = tr_d[['warehouse', 'date', 'holiday_name', 'holiday', 'shops_closed',
             'winter_school_holidays', 'school_holidays', 'year', 'day', 'month',
             'month_name', 'day_of_week', 'week', 'year_sin', 'year_cos',
             'month_sin', 'month_cos', 'day_sin', 'day_cos', 'group',
             'total_holidays_month', 'total_shops_closed_week', 'group_sin',
             'group_cos', 'orders']].copy()

# ---------------------------------------------------------------------------
# Categorical encoding (fit on train, reuse on test)
# ---------------------------------------------------------------------------
le_month = LabelEncoder()
le_week = LabelEncoder()
le_war = LabelEncoder()

tr_d['month_name'] = le_month.fit_transform(tr_d['month_name'])
tr_d['day_of_week'] = le_week.fit_transform(tr_d['day_of_week'])
tr_d['warehouse'] = le_war.fit_transform(tr_d['warehouse'])

te_d['month_name'] = le_month.transform(te_d['month_name'])
te_d['day_of_week'] = le_week.transform(te_d['day_of_week'])
te_d['warehouse'] = le_war.transform(te_d['warehouse'])


# ---------------------------------------------------------------------------
# Text features: TF-IDF followed by truncated SVD
# ---------------------------------------------------------------------------
def fit_tfidf_svd(df, text_column, max_features=1000, n_components=10):
    """Fit a TF-IDF vectorizer and a TruncatedSVD on ``df[text_column]``.

    Args:
        df: DataFrame holding the text column.
        text_column: Name of the text column.
        max_features: ``max_features`` passed to ``TfidfVectorizer``.
        n_components: Number of SVD components.

    Returns:
        Tuple ``(vectorizer, svd)`` of fitted transformers.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    svd = TruncatedSVD(n_components)
    svd.fit(vectorizer.fit_transform(df[text_column]))
    return vectorizer, svd


def apply_tfidf_svd(df, text_column, max_features=1000, n_components=10,
                    vectorizer=None, svd=None):
    """Append SVD-reduced TF-IDF columns for ``text_column`` to ``df``.

    New columns are named ``"<text_column>_tfidf_<i>"``.

    Args:
        df: Input DataFrame; its index is reset in the returned frame.
        text_column: Name of the text column to vectorize.
        max_features: Used only when no fitted transformers are supplied.
        n_components: Used only when no fitted transformers are supplied.
        vectorizer: Optional already-fitted ``TfidfVectorizer``.
        svd: Optional already-fitted ``TruncatedSVD``.

    Returns:
        A new DataFrame: ``df`` with a fresh index plus the TF-IDF columns.

    Raises:
        ValueError: If only one of ``vectorizer`` / ``svd`` is supplied.
    """
    if (vectorizer is None) != (svd is None):
        raise ValueError("vectorizer and svd must be supplied together")
    if vectorizer is None:
        # Legacy path: fit on the frame being transformed.
        vectorizer, svd = fit_tfidf_svd(df, text_column, max_features, n_components)

    reduced = svd.transform(vectorizer.transform(df[text_column]))
    tfidf_df = pd.DataFrame(
        reduced,
        columns=[f"{text_column}_tfidf_{i}" for i in range(reduced.shape[1])],
    )
    return pd.concat([df.reset_index(drop=True), tfidf_df], axis="columns")


# Fit once on train and reuse for test. Fitting separately on each frame
# yields different vocabularies and SVD bases, so the same column name would
# mean different things for train and test.
holiday_vectorizer, holiday_svd = fit_tfidf_svd(tr_d, 'holiday_name')
tr_d = apply_tfidf_svd(tr_d, 'holiday_name', vectorizer=holiday_vectorizer, svd=holiday_svd)
te_d = apply_tfidf_svd(te_d, 'holiday_name', vectorizer=holiday_vectorizer, svd=holiday_svd)

tr_d.drop(['date', 'holiday_name'], axis=1, inplace=True)
te_d.drop(['date', 'holiday_name'], axis=1, inplace=True)

print(f"Shape Of Train Data is {tr_d.shape}")
print(f"Shape Of Test Data is {te_d.shape}")

# %%time  (IPython cell magic; kept as a comment because it is not valid
# Python outside a notebook cell)
X = tr_d.drop('orders', axis=1)
y = tr_d['orders']

# Make the test frame use exactly the train feature columns, in train order.
te_d = te_d[X.columns]


# ---------------------------------------------------------------------------
# Cross-validated training
# ---------------------------------------------------------------------------
def cross_validate(model, n_splits=15):
    """Run GroupKFold CV for ``model`` and return averaged test predictions.

    Uses the module-level ``X``, ``y`` and ``te_d``. Folds are grouped by
    ``X['group']``. A clone of ``model`` is fitted per fold with the
    validation split passed as ``eval_set``. The mean and standard deviation
    of the per-fold MAPE are printed.

    Args:
        model: Unfitted estimator whose ``fit`` accepts ``eval_set``.
        n_splits: Number of GroupKFold splits.

    Returns:
        Array of length ``len(te_d)``: the mean of the per-fold predictions
        on ``te_d``.
    """
    scores = []
    test_preds = np.zeros(len(te_d))
    groups = X['group']
    kfold = GroupKFold(n_splits=n_splits)

    for train_index, valid_index in kfold.split(X, y, groups=groups):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_val, y_val = X.iloc[valid_index], y.iloc[valid_index]

        m = clone(model)
        m.fit(X_train, y_train, eval_set=[(X_val, y_val)])

        scores.append(mean_absolute_percentage_error(y_val, m.predict(X_val)))
        test_preds += m.predict(te_d) / n_splits
        gc.collect()

    scores = np.array(scores)
    print(f" MAPE mean: {scores.mean():.7f} (+- {scores.std():.7f})")
    return test_preds


# %%time
SEED = 2375
cat = CatBoostRegressor(verbose=0, learning_rate=0.01, iterations=2000, random_state=SEED)
cat_test_preds = cross_validate(cat)

SEED = 1023
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05, verbosity=0, random_state=SEED)
xgb_test_preds = cross_validate(xgb)

# %%time
lgb = LGBMRegressor(verbose=-1, random_state=SEED)
lgb_test_preds = cross_validate(lgb)

# %%time
# ---------------------------------------------------------------------------
# Weighted ensemble and submission
# ---------------------------------------------------------------------------
weights = {
    'cat_test_preds': 0.45,
    'lgb_test_preds': 0.45,
    'xgb_test_preds': 0.1,
}

cat_test_preds_weighted = cat_test_preds * weights['cat_test_preds']
lgb_test_preds_weighted = lgb_test_preds * weights['lgb_test_preds']
xgb_test_preds_weighted = xgb_test_preds * weights['xgb_test_preds']

ensemble_preds = cat_test_preds_weighted + lgb_test_preds_weighted + xgb_test_preds_weighted

# The sample submission already carries the `id` column; only `orders` is set.
d_s['orders'] = ensemble_preds
d_s.to_csv('Submission.csv', index=False)
print(d_s.head())
文章版权声明:除非注明,否则均为主机测评原创文章,转载或复制请以超链接形式并注明出处。