LightGBM
# lightgbm
params = {
'learning_rate': 0.01,
'boosting_type': 'gbdt',
'objective': 'regression_l1',
'metric': 'mae',
'feature_fraction': 0.6,
'bagging_fraction': 0.8,
'bagging_freq': 2,
'num_leaves': 31,
'verbose': -1,
'max_depth': 5,
'lambda_l1': 0,
'lambda_l2': 2.5,
'nthread': 4
}
# k-cv
N_FOLDS = 5
y = train_df['信用分']
kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=2019)
kf = kfold.split(X, y)
# process the k-cv
cv_pred = np.zeros(test_df.shape[0])
valid_best_l2_all = 0
feature_importance_df = pd.DataFrame()
count = 0
for i, (train_idx, test_idx) in enumerate(kf):
print('fold: ',i, ' training')
X_train, X_test, y_train, y_test = X.iloc[train_idx, :], X.iloc[test_idx, :], y.iloc[train_idx], y.iloc[test_idx]
# X_train, X_test, y_train, y_test = X[train_idx, :], X[test_idx, :], y[train_idx], y[test_idx]
data_train = lgb.Dataset(X_train, y_train)
data_test = lgb.Dataset(X_test, y_test)
lgb_model = lgb.train(params, data_train, num_boost_round=10000, valid_sets=data_test,
verbose_eval=-1, early_stopping_rounds=50)
cv_pred += lgb_model.predict(X_submit, num_iteration=lgb_model.best_iteration)
valid_best_l2_all += lgb_model.best_score['valid_0']['l1']
# fold_importance_df = pd.DataFrame()
# fold_importance_df["feature"] = list(unicode(X_train.columns))
# fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain', iteration=lgb_model.best_iteration)
# fold_importance_df["fold"] = count + 1
# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
count += 1
cv_pred /= N_FOLDS
valid_best_l2_all /= N_FOLDS
print('cv score for valid is: ', 1 / (1 + valid_best_l2_all))
# show the importance of features
# display_importances(feature_importance_df)
保存训练好的机器学习模型
训练好一个模型后可以存下来,下次直接用,省得花很多时间继续跑了。
使用 Python 自带的 pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
import pickle
#方法一:python自带的pickle
(X,y) = datasets.load_iris(return_X_y=True)
rfc = RandomForestClassifier(n_estimators=100,max_depth=100)
rfc.fit(X,y)
print(rfc.predict(X[0:1,:]))
#save model
f = open('saved_model/rfc.pickle','wb')
pickle.dump(rfc,f)
f.close()
#load model
f = open('saved_model/rfc.pickle','rb')
rfc1 = pickle.load(f)
f.close()
print(rfc1.predict(X[0:1,:]))
使用 sklearn 中的模块 joblib
说是 joblib 会更快速一点。
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.externals import joblib
#方法二:使用sklearn中的模块joblib
(X,y) = datasets.load_iris(return_X_y=True)
rfc = RandomForestClassifier(n_estimators=100,max_depth=100)
rfc.fit(X,y)
print(rfc.predict(X[0:1,:]))
#save model
joblib.dump(rfc, 'saved_model/rfc.pkl')
#load model
rfc2 = joblib.load('saved_model/rfc.pkl')
print(rfc2.predict(X[0:1,:]))