调参

GridSearchCV() 对估算器指定参数值进行详尽搜索

from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

grid_search.best_params_
# Out:{'max_features': 8, 'n_estimators': 30}

grid_search.best_estimator_
# Out:
# RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
#            max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
#            min_impurity_split=None, min_samples_leaf=1,
#            min_samples_split=2, min_weight_fraction_leaf=0.0,
#            n_estimators=30, n_jobs=1, oob_score=False, random_state=42,
#            verbose=0, warm_start=False)

estimator : estimator object.每个估算器需要提供一个score函数或填写scoring参数。
param_grid : dict or list of dictionaries，键作为参数名称，list 作为参数的字典。或存有这样的字典的列表。
scoring : string, callable, list/tuple, dict or None, default: None，
cv : int, cross-validation generator or an iterable, optional，如果是整数，则代表 KFold

refit : boolean, or string, default=True，应用已找到的最好的参数到整个数据集上。

Methods	description
decision_function(X)	Call decision_function on the estimator with the best found parameters.
fit(X[, y, groups])	Run fit with all sets of parameters.
get_params([deep])	Get parameters for this estimator.
inverse_transform(Xt)	Call inverse_transform on the estimator with the best found params.
predict(X)	Call predict on the estimator with the best found parameters.
predict_log_proba(X)	Call predict_log_proba on the estimator with the best found parameters.
predict_proba(X)	Call predict_proba on the estimator with the best found parameters.
score(X[, y])	Returns the score on the given data, if the estimator has been refit.
set_params(**params)	Set the parameters of this estimator.
transform(X)	Call transform on the estimator with the best found parameters.

RandomizedSearchCV()

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

estimator : estimator object.指定估算器对象。
param_distributions : dict，给定以参数名为键，list 为参数的字典。或提供一个分布，分布必须提供一个rvs方法进行采样，例如来自 scipy.stats.distributions 的方法。
n_iter : int, default=10，采样参数设置数量。
scoring : string, callable, list/tuple, dict or None, default: None
cv : int, cross-validation generator or an iterable, optional
refit : boolean, or string default=True
random_state : int, RandomState instance or None, optional, default=None

模型

分类

RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=50, n_jobs=-1)
scores = cross_val_score(rnd_clf, trainData, trainLabel,
                         scoring="recall", cv=10)
scores.mean()

ExtraTreesClassifier

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
etr_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=50, n_jobs=-1)
etr_score = cross_val_score(etr_clf, trainData, trainLabel, scoring='recall', cv=10)
etr_score.mean()

GradientBoostingClassifier

1 2	from sklearn.ensemble import GradientBoostingClassifier gbdt = GradientBoostingClassifier(learning_rate=0.1,min_samples_leaf=2,max_depth=6,n_estimators=100)

XGBClassifier

from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
xgb1 = XGBClassifier(learning_rate =0.05,
                     n_estimators=1000,
                     max_depth=3,
                     min_child_weight=1,
                     gamma=0.1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     reg_alpha=0.001,
                     reg_lambda=0.001,
                     scale_pos_weight=1,
                     seed=27)
xgb1.fit(subTrain, subLabel)
xgb1_pred = xgb1.predict(trainData)
xgb1_pred_prob = xgb1.predict_proba(trainData)[:, 1]
print(metrics.accuracy_score(trainLabel, xgb1_pred))
print(metrics.roc_auc_score(trainLabel, xgb1_pred_prob))
xgb1_f2 = calc_f2(trainLabel, xgb1_pred)
print(xgb1_f2)

Lightgbm

import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                            boosting_type='gbdt',
                            objective='binary',
                            n_estimators=1000,
                            metric='auc',
                            max_depth=3,
                            num_leaves=5,
                            subsample=0.7,
                            colsample_bytree=0.7,
                            min_data_in_leaf=450,
                            feature_fraction=0.7,
                            bagging_fraction=0.7,
                            bagging_freq=6,
                            lambda_l1=1,
                            lambda_l2=0.001,
                            min_gain_to_split=0.265,
                            verbose=5,
                            is_unbalance=True)
lgb_clf.fit(subTrain, subLabel)
lgb_clf_pred = lgb_clf.predict(trainData)

模型融合

BaggingClassifier

from sklearn.ensemble import BaggingClassifier
bag_rnd = BaggingClassifier(rnd_clf, n_estimators=10, max_samples=1000, bootstrap=True, n_jobs=-1)
bag_rnd.fit(subTrain, subLabel)
rnd_pred = bag_rnd.predict(trainData)

VotingClassifier

from sklearn.ensemble import VotingClassifier
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(learning_rate =0.1,
                     n_estimators=1000,
                     max_depth=3,
                     min_child_weight=1,
                     gamma=0.1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     reg_alpha=0.001,
                     reg_lambda=0.001,
                     scale_pos_weight=1)

import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                            boosting_type='gbdt',
                            objective='binary',
                            n_estimators=1000,
                            metric='auc',
                            max_depth=3,
                            num_leaves=5,
                            subsample=0.7,
                            colsample_bytree=0.7,
                            min_data_in_leaf=450,
                            feature_fraction=0.7,
                            bagging_fraction=0.7,
                            bagging_freq=6,
                            lambda_l1=1,
                            lambda_l2=0.001,
                            min_gain_to_split=0.265,
                            verbose=5,
                            is_unbalance=True)

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=320, min_samples_leaf=7, max_depth=7,
                                 max_features='sqrt', subsample=0.7, random_state=10)

vot = VotingClassifier(estimators=[('xgb', xgb), ('lgb', lgb_clf)], voting='soft')
vot.fit(trainData, trainLabel)
vot_pred = vot.predict(testData)
vot_pred = pd.DataFrame(vot_pred)
submData['acc_now_delinq'] = vot_pred
submData.to_csv(os.path.join(homePath, 'submit.csv'), index=False)

StackingClassifier

from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=110, min_samples_split=90, min_samples_leaf=15,max_depth=10,oob_score=True,max_features=10)

from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(learning_rate =0.05,
                     n_estimators=1000,
                     max_depth=3,
                     min_child_weight=1,
                     gamma=0.1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     reg_alpha=0.001,
                     reg_lambda=0.001,
                     scale_pos_weight=1)

import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
                            boosting_type='gbdt',
                            objective='binary',
                            n_estimators=1000,
                            metric='auc',
                            max_depth=3,
                            num_leaves=5,
                            subsample=0.7,
                            colsample_bytree=0.7,
                            min_data_in_leaf=450,
                            feature_fraction=0.7,
                            bagging_fraction=0.7,
                            bagging_freq=6,
                            lambda_l1=1,
                            lambda_l2=0.001,
                            min_gain_to_split=0.265,
                            verbose=5,
                            is_unbalance=True)

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=320, min_samples_leaf=7, max_depth=7,
                                 max_features='sqrt', subsample=0.7)


from mlxtend.classifier import StackingClassifier
stack_clf = StackingClassifier(classifiers=[gbdt, lgb_clf, rnd_clf, xgb], meta_classifier=xgb)

stack_clf.fit(trainData, trainLabel)
stack_pred = stack_clf.predict(testData)

数据处理

特征放缩

MinMax scaling 归一化

该方法更容易受离散点影响
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

X_scaled = X_std * (max - min) + min
>>> from sklearn.preprocessing import MinMaxScaler
>>>
>>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
>>> scaler = MinMaxScaler()
>>> print(scaler.fit(data))
MinMaxScaler(copy=True, feature_range=(0, 1))
>>> print(scaler.data_max_)
[  1.  18.]
>>> print(scaler.transform(data))
[[ 0.    0.  ]
 [ 0.25  0.25]
 [ 0.5   0.5 ]
 [ 1.    1.  ]]
>>> print(scaler.transform([[2, 2]]))
[[ 1.5  0. ]]

feature_range : tuple (min, max), default=(0, 1)，归一化后值的范围
copy : boolean, optional, default True，是否复制数据在新的数据上归一化

零均值标准化

>>> from sklearn.preprocessing import StandardScaler
>>>
>>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
>>> scaler = StandardScaler()
>>> print(scaler.fit(data))
StandardScaler(copy=True, with_mean=True, with_std=True)
>>> print(scaler.mean_)
[ 0.5  0.5]
>>> print(scaler.transform(data))
[[-1. -1.]
 [-1. -1.]
 [ 1.  1.]
 [ 1.  1.]]
>>> print(scaler.transform([[2, 2]]))
[[ 3.  3.]]

copy : boolean, optional, default True，是否复制数据在新的数据上执行
with_mean : boolean, True by default，若为 True 则在缩放前将数据居中。但在稀疏矩阵上是行不通的。
with_std : boolean, True by default，若为 True，则将数据放缩到单位方差或等效于单位标准差

处理空数据

Imputer() 处理丢失值

各属性必须是数值

from sklearn.preprocessing import Imputer
# 指定用何值替换丢失的值，此处为中位数
imputer = Imputer(strategy="median")

# 使实例适应数据
imputer.fit(housing_num)

# 结果在statistics_ 变量中
imputer.statistics_

# 替换
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index = list(housing.index.values))

# 预览
housing_tr.loc[sample_incomplete_rows.index.values]

处理文本数据

pandas.factorize() 将输入值编码为枚举类型或分类变量

housing_cat = housing['ocean_proximity']
housing_cat.head(10)
# 输出
# 17606     <1H OCEAN
# 18632     <1H OCEAN
# 14650    NEAR OCEAN
# 3230         INLAND
# 3555      <1H OCEAN
# 19480        INLAND
# 8879      <1H OCEAN
# 13685        INLAND
# 4937      <1H OCEAN
# 4861      <1H OCEAN
# Name: ocean_proximity, dtype: object

housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
# 输出
# array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)

参数

values : ndarray (1-d)；序列
sort : boolean, default False；根据值排序
na_sentinel : int, default -1；给未找到赋的值
size_hint : hint to the hashtable sizer

返回值

labels : the indexer to the original array
uniques : ndarray (1-d) or Index；当传递的值是 Index 或 Series 时，返回独特的索引。

OneHotEncoder 编码整数特征为 one-hot 向量

返回值为稀疏矩阵

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

注意fit_transform()期望一个二维数组，所以这里将数据 reshape 了。

处理文本特征示例

housing_cat = housing['ocean_proximity']
housing_cat.head(10)
# 17606     <1H OCEAN
# 18632     <1H OCEAN
# 14650    NEAR OCEAN
# 3230         INLAND
# 3555      <1H OCEAN
# 19480        INLAND
# 8879      <1H OCEAN
# 13685        INLAND
# 4937      <1H OCEAN
# 4861      <1H OCEAN
# Name: ocean_proximity, dtype: object

housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]
# array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)

housing_categories
# Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
print(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
# [[0]
#  [0]
#  [1]
#  ...,
#  [2]
#  [0]
#  [3]]
# <16512x5 sparse matrix of type '<class 'numpy.float64'>'
# 	with 16512 stored elements in Compressed Sparse Row format>

LabelEncoder 标签编码

LabelEncoder`是一个可以用来将标签规范化的工具类，它可以将标签的编码值范围限定在[0,n_classes-1]。简单来说就是对不连续的数字或者文本进行编号。

>>> from sklearn import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6])
array([0, 0, 1, 2])
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])

当然，它也可以用于非数值型标签的编码转换成数值标签（只要它们是可哈希并且可比较的）:

>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1])
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']

LabelBinarizer 标签二值化

LabelBinarizer 是一个用来从多类别列表创建标签矩阵的工具类:

>>> from sklearn import preprocessing
>>> lb = preprocessing.LabelBinarizer()
>>> lb.fit([1, 2, 6, 4, 2])
LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
>>> lb.classes_
array([1, 2, 4, 6])
>>> lb.transform([1, 6])
array([[1, 0, 0, 0],
       [0, 0, 0, 1]])

对于多类别是实例，可以使用:class:MultiLabelBinarizer:

>>> lb = preprocessing.MultiLabelBinarizer()
>>> lb.fit_transform([(1, 2), (3,)])
array([[1, 1, 0],
       [0, 0, 1]])
>>> lb.classes_
array([1, 2, 3])

数据挖掘之机器学习

调参