## 1. 数据观察与处理

# Basic overview of the data: non-null counts (missing values) and the dtype of each column
train_df.info()
# Summary statistics of the numeric columns
train_df.describe()


import matplotlib.pyplot as plt
# Show up to 50 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 50)


### 1.1 空值处理

# Number of missing values in each column
train_df.isnull().sum()
# Equivalent call: isna() and isnull() are aliases
train_df.isna().sum()


# Impute missing values.
# The filled column is assigned back to `combi` instead of calling
# fillna(..., inplace=True) on `combi[col]`: that form is chained assignment,
# which can operate on a temporary object and leave `combi` unchanged
# (it never updates the parent frame under pandas copy-on-write).

# Numerical column: fill with the column mean.
combi['Item_Weight'] = combi['Item_Weight'].fillna(combi['Item_Weight'].mean())
# Categorical column: fill with an explicit "missing" category.
combi['Outlet_Size'] = combi['Outlet_Size'].fillna("missing")


### 1.2 EDA

# For a feature that looks suspicious, inspect its density curve
plt.figure(figsize=(8, 6))
# Alternative using the pandas plotting API:
# train_df['用户近6个月平均消费值（元）'].plot(kind='kde')
sns.kdeplot(train_df['用户近6个月平均消费值（元）'])


# Inspect how the '信用分' column is distributed for each value of one
# categorical feature.
check_feat = '当月是否看电影'
train_df[check_feat].value_counts()

# One figure per distinct value of `check_feat`. The loop body must be
# indented; without it the script does not parse.
for val in train_df[check_feat].unique():
    plt.figure(figsize=(8, 6))
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11;
    # consider sns.histplot(..., bins=50) when upgrading.
    sns.distplot(train_df.loc[train_df[check_feat] == val, '信用分'].values, bins=50, kde=False)


# Pearson correlation of every non-object column with '信用分', drawn as a
# horizontal bar chart sorted by coefficient.
x_cols = [
    col for col in train_df.columns
    if col not in ['信用分'] and train_df[col].dtype != 'object'
]

# Column names and their correlation with '信用分', in x_cols order.
labels = list(x_cols)
values = [
    np.corrcoef(train_df[col].values, train_df['信用分'].values)[0, 1]
    for col in x_cols
]
corr_df = pd.DataFrame({'cols_labels': labels, 'corr_values': values})
# Sort ascending so the bars are ordered by coefficient; the tick labels
# below are taken from the sorted frame so they stay aligned with the bars.
corr_df = corr_df.sort_values(by='corr_values')

idx = np.arange(len(labels))
# NOTE(review): `width` is not used anywhere in the visible code.
width = 0.5

fig, ax = plt.subplots(figsize=(12, 40))
rects = ax.barh(idx, np.array(corr_df['corr_values'].values), color='y')
ax.set_yticks(idx)
ax.set_yticklabels(corr_df['cols_labels'].values, rotation='horizontal')
ax.set_xlabel('Correlation coefficient')
ax.set_title('Correlation coefficient of the variables')


# Spearman rank-correlation matrix of the features, rendered as a heatmap.
# Numeric/boolean columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on object-dtype data instead.
corr_mat = train_df.select_dtypes(include=['number', 'bool']).corr(method='spearman')
f, ax = plt.subplots(figsize=(12, 12))

sns.heatmap(corr_mat, vmax=1., square=True)
plt.title('Important variables correlation map', fontsize=15)


## 2. 特征工程

• Categorical Encoding
• Sklearn LabelEncoder (将数据处理成 0 到 n_class-1 的结果)
• Sklearn OneHotEncoder（将一个特征展开为 n_class 列的 0/1 向量，每列对应一个类别；设置 drop='first' 时为 n_class-1 列。适用于没有顺序关系的标称型特征）
• Pandas Categorical dtype
• pandas.get_dummies (针对字符类型会变成One-hot，数字类型会保留原有的！)
# One-hot encode the Embarked feature for the train and test frames.
# NOTE(review): `sparse=False` was renamed to `sparse_output=False` in
# scikit-learn 1.2; update the keyword when upgrading.
onehot_encoder = OneHotEncoder(sparse=False)
train_OneHotEncoded = onehot_encoder.fit_transform(train_Embarked)
# Reuse the encoder fitted on the training data. Re-fitting on the test set
# could yield a different category set/order, so the columns would no longer
# line up with the training columns.
test_OneHotEncoded = onehot_encoder.transform(test_Embarked)

# NOTE(review): OneHotEncoder orders its output columns by sorted category
# (see onehot_encoder.categories_); confirm that indices 0/1/2 really
# correspond to S/C/Q before relying on these column names.
copy_df["EmbarkedS"] = train_OneHotEncoded[:,0]
copy_df["EmbarkedC"] = train_OneHotEncoded[:,1]
copy_df["EmbarkedQ"] = train_OneHotEncoded[:,2]
copyTest_df["EmbarkedS"] = test_OneHotEncoded[:,0]
copyTest_df["EmbarkedC"] = test_OneHotEncoded[:,1]
copyTest_df["EmbarkedQ"] = test_OneHotEncoded[:,2]


## 3. 算法训练

### k 轮交叉验证

# k-fold cross-validation setup
N_FOLDS = 5
y = train_df['score']
# NOTE(review): StratifiedKFold treats y as class labels; if 'score' is a
# continuous target, plain KFold may be the intended splitter. Confirm.
kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=2019)
kf = kfold.split(X, y)

# Prediction buffer with one entry per row of test_df, initialised to zero
cv_pred = np.zeros(test_df.shape[0])

count = 0
# The loop body must be indented; without it the script does not parse.
for i, (train_idx, test_idx) in enumerate(kf):
    print('fold: ',i, ' training')
    # Positional split of features and target for the current fold
    X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


### 提交结果

# Build the submission frame from the test ids and the predictions in cv_pred.
# .copy() makes submit_df an independent frame, so adding a column neither
# triggers SettingWithCopyWarning nor depends on view/copy semantics of the
# slice taken from test_df.
submit_df = test_df[['uid']].copy()
submit_df['score'] = cv_pred
submit_df.columns = ['id', 'score']

# Round the scores to integers in one vectorised step (same np.round rounding
# as before, without a per-row Python lambda).
submit_df['score'] = np.round(submit_df['score']).astype('int64')
submit_df.to_csv('./submission/baseline_0.06357.csv', index=False)


1. Categorical Data（标称型）
2. Numerical/Continuous Data（数值型）
3. Ordinal Data（序数型）
4. Time Data（时间型）