## 1. 模型调用

### 1.1 判别式模型

#切分训练集数据
train = data[data.iyear <= 2017]
test  = data[data.iyear == 2018]

train_X = train.drop('label', axis=1)
train_y = train['label']

test_X = test.drop('label', axis=1)
test_y

#训练
from sklearn.svm import SVC, LinearSVC
svc = SVC()
svc.fit(train_X, train_y)
svc.score(train_X, train_y)

#预测
test_y = svc.predict(test_X)


### 1.1.1 判别式模型 (管道)

df = pd.read_csv('data/train.csv')
y = df.author.values
X = df.text.values

X2 = df2.text.values

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm

text_clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', svm.LinearSVC())
])
text_clf = text_clf.fit(X,y)
y2 = text_clf.predict(X2)
y2


### 1.2 生成式模型

#训练集:eventid,nlp,gname
#测试集:eventid,nlp
data = pd.concat([train,test])

#文本数据词频向量化
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(data['nlp'])
train_x = tfidf.transform(train['nlp'])
test_x = tfidf.transform(test['nlp'])
print('tfidf prepared !')

#choice 1: 对train_y进行编码映射
author_mapping_dict = {'Islamic State of Iraq and the Levant (ISIL)':0, 'Taliban':1, 'Al-Shabaab':2, 'Boko Haram':3, 'Houthi extremists (Ansar Allah)':4}
train_y = train['gname'].map(author_mapping_dict)

#choice 2:当train_y数据比较多的时候
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
train_y = enc.fit_transform(train.gname.values)

#多项式贝叶斯模型的调用
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(train_x, train_y)

cv_scores = []
cv_scores.append(metrics.log_loss(train_y, predictions))
print("Mean cv score : ", np.mean(cv_scores))

prediction = clf.predict_proba(test_x)

#对应choice 1的输出：
out_df = pd.DataFrame(prediction)
out_df.columns = ['Islamic State of Iraq and the Levant (ISIL)', 'Taliban', 'Al-Shabaab', 'Boko Haram', 'Houthi extremists (Ansar Allah)']
out_df.insert(0, 'eventid', test['eventid'])
out_df.to_csv("data/out_1.csv", index=False)

#对应choice 2的输出：
out_df = pd.DataFrame(prediction)
out_df.columns = list(enc.classes_)
out_df.insert(0, 'eventid', test['eventid'])
out_df.to_csv("data/out_2.csv", index=False)

#选取概率最大值所对应的列名输出
df = out_df.drop('eventid',axis=1)
pred = df.idxmax(axis=1)
submission = pd.DataFrame({"eventid": test['eventid'],"gname": pred})
submission.to_csv('data/out.csv', index=False)


### 1.3 自定义迭代

def LGB_predict(train_x,train_y,test_x,res):
print("LGB test")
clf = lgb.LGBMClassifier(
boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,zero_as_missing=True,
max_depth=-1, n_estimators=500, objective='binary',
subsample=0.9, colsample_bytree=0.8, subsample_freq=1,
learning_rate=0.1, min_child_weight=50, random_state=2018, n_jobs=100
)
clf.fit(train_x, train_y, eval_set=[(train_x, train_y)], eval_metric='auc',early_stopping_rounds=100)
res['score'] = clf.predict_proba(test_x)[:,1]
res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
res.to_csv('../data/submission.csv', index=False)
os.system('zip baseline.zip ../data/submission.csv')
return clf

model=LGB_predict(train_x,train_y,test_x,res)


## 2. 模型优化 (调参)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 随机森林分类模型
RFC = RandomForestClassifier()

##设置备选属性用于grid search
rf_param_grid = {"max_depth": [None],
"max_features": [1, 3, 10],
"min_samples_split": [2, 3, 10],
"min_samples_leaf": [1, 3, 10],
"bootstrap": [False],
"n_estimators" :[100,300],
"criterion": ["gini"]}
#用于系统地遍历多种参数组合，通过交叉验证确定最佳效果参数
sRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gsRFC.fit(X_train,Y_train)
#得到最佳参数组合
RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_


### 2.2 交叉验证 (Cross Validation)

#按比例划分样本（随机抽样）
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)  #X为样本、y为类别、test_size为测试集比例

#通过cross_validation，设置cv=5，进行5折交叉验证，最后得到一个scores的预测准确率数组，表示每次交叉验证得到的准确率。
clf = svm.SVC(kernel='linear', C=1)
scores = cross_validation.cross_val_score(clf, X, y, cv=5)
>>>array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
>>>Accuracy: 0.98 (+/- 0.03)


k折： 使用k折交叉验证来寻找最优参数要比holdout方法更稳定。一旦我们找到最优参数，要使用这组参数在原始数据集上训练模型作为最终的模型。

from sklearn.cross_validation import cross_val_score # K折交叉验证模块
import matplotlib.pyplot as plt #可视化模块
from sklearn.neighbors import KNeighborsClassifier # K最近邻(kNN，k-NearestNeighbor)分类算法

#建立测试参数集
k_range = range(1, 31)
k_scores = []

#借由迭代的方式来计算不同参数对模型的影响，并返回交叉验证后的平均准确率
for k in k_range:
knn = KNeighborsClassifier(n_neighbors=k)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
k_scores.append(scores.mean())
print('Fold %s, Acc: %.3f' %(k+1,scores))

#可视化数据
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

# 5折LGB模型训练
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=48, max_depth=-1, learning_rate=0.02, n_estimators=6000, max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,min_child_weight=5, min_child_samples=10, subsample=0.8, subsample_freq=1,colsample_bytree=0.8, reg_alpha=3, reg_lambda=0.1, seed=1000, n_jobs=-1, silent=True)

skf = list(StratifiedKFold(y_loc_train, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0

for i, (train_index, test_index) in enumerate(skf):
print("Fold", i)
lgb_model = lgb_clf.fit(X_loc_train[train_index], y_loc_train[train_index],
eval_names=['train', 'valid'],
eval_metric='logloss',
eval_set=[(X_loc_train[train_index], y_loc_train[train_index]),
(X_loc_train[test_index], y_loc_train[test_index])], early_stopping_rounds=100)
baseloss.append(lgb_model.best_score_['valid']['binary_logloss'])
loss += lgb_model.best_score_['valid']['binary_logloss']
test_pred = lgb_model.predict_proba(X_loc_test, num_iteration=lgb_model.best_iteration_)[:, 1]
print('test mean:', test_pred.mean())
res['prob_%s' % str(i)] = test_pred
print('logloss:', baseloss, loss / 5)


## 3. 常规输出

### 3.1 表格输出

submission = pd.DataFrame({
"PassengerId": test_df["PassengerId"],
"Survived": Y_pred
})
submission.to_csv('data/titanic.csv', index=False)


### 3.2 One-hot后表格输出

from sklearn import preprocessing
encoder = preprocessing.LabelBinarizer()
encoder.fit(list(set(y2)))
one_hot_labels = encoder.transform(y2)
prediction = pd.DataFrame(one_hot_labels, columns=['EAP','HPL','MWS']).to_csv('data/author-pre.csv')


## 4. 模型融合

1.缘分加权融合

import pandas as pd
result = pd.DataFrame()

a = 0.7
sub['score'] = round((((result['nffm_75866_1']*0.5+result['nffm_75866_2']*0.5)*0.5+result['nffm_75866_0']*0.5)*0.2+result['nffm_7688']*0.4+(result['nffm_763']*0.4+result['nffm_765']*0.6)*0.4)*a+result['lgb']*(1-a),6)

print(sub['score'].describe())
sub.to_csv('../submission.csv',index=False)


2.计算模型之间最大信息系数（MIC），画热力图，选择相关性小的模型进行多模型加权融合。

import pandas as pd
import numpy as np
from minepy import MINE

fs = ['discret_5','R_7199','rank','discret_10','raw_rank','Py_717','Py_725','svm_6938']
out_nums = 8   #文件数

res = []

cm = []

for i in range(out_nums):
tmp = []
for j in range(out_nums):
m = MINE()
m.compute_score(res[i], res[j])
tmp.append(m.mic())
cm.append(tmp)

#画热力图
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(out_nums)
plt.xticks(tick_marks, fs, rotation=45)
plt.yticks(tick_marks, fs)
plt.tight_layout()

plot_confusion_matrix(cm, title='mic')
plt.show()


import pandas as pd