## Binning

#Binning:
import pandas as pd

def binning(col, cut_points, labels=None):
    #Define min and max values:
    minval = col.min()
    maxval = col.max()

    #create list by adding min and max to cut_points
    break_points = [minval] + cut_points + [maxval]

    #if no labels provided, use default labels 0 ... (n-1)
    if not labels:
        labels = range(len(cut_points) + 1)

    #Binning using the cut function of pandas
    colBin = pd.cut(col, bins=break_points, labels=labels, include_lowest=True)
    return colBin

#Binning LoanAmount:
cut_points = [90, 140, 190]
labels = ["low", "medium", "high", "very high"]
data["LoanAmount_Bin"] = binning(data["LoanAmount"], cut_points, labels)
print(data["LoanAmount_Bin"].value_counts(sort=False))


## Groupby operations

# group by one column and list the unique values of another column
item_attr.groupby('store_id')['item_price'].unique()
# group by one column and take the mean of another column
item_attr.groupby('store_id')['item_price'].mean()
# keep only stores with at least 5 item prices, then average per store
tmp_price = item_attr.groupby('store_id')
tmp_price.filter(lambda x: x['item_price'].count() >= 5).groupby('store_id')['item_price'].mean()


## Miscellaneous methods (to organize)

# reset_index operates on a DataFrame only, not directly on a groupby result
df.reset_index()
# find the cate_id with the largest number of items
xx_merged.groupby('cate_id')['item_id'].count().idxmax()
# select multiple columns
df1 = df[['a', 'b']]

## Agg usage
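
A minimal groupby/agg sketch, reusing the item_attr table from the Groupby section (the aggregation list is illustrative):

item_attr.groupby('store_id')['item_price'].agg(['mean', 'max', 'count'])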

## Merge usage

# left-join item attributes onto the training rows by item_id
xx_merged = pd.merge(train_xx, item_attr, on='item_id', how='left')
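
A toy illustration of how='left' (hypothetical frames):

left = pd.DataFrame({'item_id': [1, 2, 3]})
right = pd.DataFrame({'item_id': [1, 2], 'item_price': [10, 20]})
pd.merge(left, right, on='item_id', how='left')
#    item_id  item_price
# 0        1        10.0
# 1        2        20.0
# 2        3         NaN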


## Datetime handling

# data format: 2018-08-14 03:10:41
# astype(datetime) does not actually parse strings; use pd.to_datetime
train['create_order_time'] = pd.to_datetime(train['create_order_time'])
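
Once converted, date parts can be pulled out with the .dt accessor (the new column names here are illustrative):

train['order_date'] = train['create_order_time'].dt.date
train['order_hour'] = train['create_order_time'].dt.hour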


## Splitting data

In [11]: df = pd.DataFrame(np.random.randn(100, 2))
In [12]: msk = np.random.rand(len(df)) < 0.8
In [13]: train = df[msk]
In [14]: test = df[~msk]
In [15]: len(test)
Out[15]: 21
In [16]: len(train)
Out[16]: 79
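
The same random split is commonly done with scikit-learn's train_test_split:

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)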


## Data processing

# Copy a DataFrame in Python
data1 = data_raw.copy(deep = True)

# Read data from a *.csv file (the file name here is illustrative)
train_df = pd.read_csv('train.csv')

# View the column names
train_df.columns

# Drop a column
train_df.drop(['Item_Outlet_Sales'], axis=1, inplace=True)

# Subset rows whose column values fall in a given range
df = df[(df['closing_price'] >= 99) & (df['closing_price'] <= 101)]

# Concatenate the training and test data
# (DataFrame.append was removed in pandas 2.0; use pd.concat instead)
combi = pd.concat([train_df, test_df], ignore_index=True)

# Count the missing values in every column
combi.isnull().sum()

# Fill in missing values
## numerical data
combi['Item_Weight'] = combi['Item_Weight'].fillna(combi['Item_Weight'].mean())
## categorical data
combi['Outlet_Size'] = combi['Outlet_Size'].fillna("missing")

# View a column's distinct values and how often each occurs
combi['Item_Fat_Content'].value_counts()

# Remove missing values
## drop rows that contain any missing value
df.dropna()
## drop rows with a missing value in a specific column
df.dropna(subset=['col_name'])


### Mapping a handful of raw values onto a fixed set of categories

# >> combi['Item_Fat_Content'].value_counts()
# Low Fat    5089
# Regular    2889
# LF          316
# reg         117
# low fat     112
# Name: Item_Fat_Content, dtype: int64

# only two categories: Low Fat, Regular

# dictionary to replace the categories
fat_content_dict = {'Low Fat':0, 'Regular':1, 'LF':0, 'reg':1, 'low fat':0}

combi['Item_Fat_Content'] = combi['Item_Fat_Content'].replace(fat_content_dict)


# equivalent, scoping the dict to one column and replacing in place
combi.replace({'Item_Fat_Content': fat_content_dict}, inplace=True)


### Reordering rows

# order the rows of feature_matrix to match combi['id']
feature_matrix = feature_matrix.reindex(index=combi['id'])
# move the id index back to a regular column and restore a 0..n-1 RangeIndex
feature_matrix = feature_matrix.reset_index()


### iloc, loc and ix

ix is deprecated and has been removed in recent versions of pandas.

• loc gets rows (or columns) with particular labels from the index.
• iloc gets rows (or columns) at particular positions in the index (so it only takes integers).
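
The example outputs below assume a Series of NaNs with a shuffled integer index, e.g. (a hypothetical reconstruction consistent with the outputs):

import numpy as np
import pandas as pd

s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
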
>>> s.iloc[:3] # slice the first three rows
49   NaN
48   NaN
47   NaN

>>> s.loc[:3] # slice up to and including label 3
49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN

>>> s.ix[:3] # the integer is in the index so s.ix[:3] works like loc
49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN


# select the n-th column by position (old ix syntax; use iloc in modern pandas)
df.iloc[:, n]


### Extracting fields from strings

The Name attribute takes forms like 'Braund, Mr. Owen Harris', 'Futrelle, Mrs. Jacques Heath (Lily May Peel)' and 'Heikkinen, Miss. Laina'. To extract the title from Name, you can do the following:

combi['Title'] = combi['Name'].str.extract(r'([A-Za-z]+)\.', expand=True)

expand=True makes the result a DataFrame; otherwise it is a Series. The extracted values are Mr, Mrs and Miss.
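
A small self-contained demo of the extraction (hypothetical two-row Series):

names = pd.Series(['Braund, Mr. Owen Harris',
                   'Futrelle, Mrs. Jacques Heath (Lily May Peel)'])
print(names.str.extract(r'([A-Za-z]+)\.', expand=True))
#      0
# 0   Mr
# 1  Mrs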

### Filling missing values

DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)
# Fill NA/NaN values using the specified method
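
For example, filling with a constant versus forward-filling (note that in recent pandas the method= argument is deprecated in favor of .ffill()/.bfill()):

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, 4.0])
s.fillna(0)   # fill with a constant: [1.0, 0.0, 0.0, 4.0]
s.ffill()     # forward-fill: [1.0, 1.0, 1.0, 4.0]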


### Saving a csv file

my_submission = pd.DataFrame({'PassengerId': passenger_id, 'Survived': y_pred})
my_submission.to_csv('auto_ft_submission.csv', index=False)


### Discretizing continuous values: the cut & qcut functions

cut splits the range of values into equal-width intervals, i.e., it works from the perspective of the values themselves; qcut splits at quantiles, so that each resulting bin contains roughly the same number of elements.

# Binning numerical columns
# q=4 means 4 quantiles, coded 0, 1, 2, 3
# labels=False returns integer codes instead of interval labels
data['CatAge'] = pd.qcut(data.Age, q=4, labels=False)


# Define the bins
# (range needs integer bounds; add 10 so the maximum age still falls inside a bin)
mybins = range(0, int(df.age.max()) + 10, 10)

# Cut the data from the DataFrame with the help of the bins
df['age_bucket'] = pd.cut(df.age, bins=mybins)

# Count the number of values per bucket
df['age_bucket'].value_counts()
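
A small contrast on synthetic data, illustrating the point above (equal-width versus equal-frequency bins):

import numpy as np
import pandas as pd

vals = pd.Series(np.random.exponential(size=1000))
print(pd.cut(vals, bins=4).value_counts())   # equal-width bins, very uneven counts
print(pd.qcut(vals, q=4).value_counts())     # quantile bins, ~250 elements each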


### Encoding categorical values as integers

# Factorize the values
labels, uniques = pd.factorize(train_df.Class)

# Save the encoded labels back in train_df.Class
train_df.Class = labels

# Print out the first rows
train_df.head()


### Scaling numerical features

from sklearn.preprocessing import StandardScaler

# learn the per-feature mean and standard deviation of X
scaler = StandardScaler().fit(X)

# rescale X to zero mean and unit variance
rescaledX = scaler.transform(X)
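
Note that StandardScaler standardizes (zero mean, unit variance); for min-max normalization to [0, 1], sklearn's MinMaxScaler is used the same way:

from sklearn.preprocessing import MinMaxScaler

normalizedX = MinMaxScaler().fit_transform(X)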


### Assigning values at particular positions in a DataFrame

# .at takes a single label, so use .loc for boolean-mask assignment
train_df.loc[train_df.label == '音乐', 'label'] = 1


## Plotting with pandas for EDA

• df.hist()
• df.plot.pie(y='col') (note: there is no df.pie(); pie charts go through .plot)
• df['col'].value_counts().plot.pie()
• df['col'].value_counts().plot.bar()
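
A minimal end-to-end sketch (assumes matplotlib is installed; the column name is hypothetical):

import matplotlib.pyplot as plt

df['Item_Fat_Content'].value_counts().plot.bar()
plt.show()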