https://www.kaggle.com/dfitzgerald3/house-prices-advanced-regression-techniques/randomforestregressor/notebook
In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import Imputer
from scipy.stats import skew
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
In [2]:
# Kaggle competition data, relative to the notebook's working directory.
TRAIN_PATH = '../input/train.csv'
TEST_PATH = '../input/test.csv'

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)
In [3]:
def is_outlier(points, thresh=3.5):
    """Flag outliers using the modified z-score (median / MAD).

    Parameters
    ----------
    points : array-like, shape (n,) or (n, m)
        Observations; a 1-D input is treated as n single-feature points.
    thresh : float, default 3.5
        Points whose modified z-score exceeds this are flagged (3.5 is the
        cutoff recommended by Iglewicz & Hoaglin).

    Returns
    -------
    numpy.ndarray of bool, shape (n,)
        True where the corresponding point is an outlier.
    """
    # Coerce lists/Series to an ndarray: multi-dim indexing such as
    # points[:, None] is only valid on ndarrays (it raises on modern
    # pandas Series).
    points = np.asarray(points, dtype=float)
    if points.ndim == 1:
        points = points[:, None]

    median = np.median(points, axis=0)
    # Euclidean distance of each point from the column-wise median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    # 0.6745 ~= Phi^-1(0.75): rescales the MAD to be consistent with the
    # standard deviation of a normal distribution. NOTE: if the MAD is 0
    # (over half the points identical) this divides by zero and flags
    # every non-median point via inf — same as the original behaviour.
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > thresh
In [4]:
# SalePrice is the last column of the training frame; compare its raw
# distribution with its natural log (used as the modelling target later).
target = df_train[df_train.columns.values[-1]]
target_log = np.log(target)

plt.figure(figsize=(10, 5))
panels = [
    (1, target, 'Original Data', 'Sale Price'),
    (2, target_log, 'Natural Log of Data', 'Natural Log of Sale Price'),
]
for position, values, title, xlabel in panels:
    plt.subplot(1, 2, position)
    # sns.distplot is deprecated in recent seaborn (use histplot/displot).
    sns.distplot(values, bins=50)
    plt.title(title)
    plt.xlabel(xlabel)
plt.tight_layout()
In [5]:
# Drop the target (last column) from train so the two frames share a schema,
# then stack them for joint preprocessing.
df_train = df_train[df_train.columns.values[:-1]]
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
# and produces the same result with ignore_index=True.
df = pd.concat([df_train, df_test], ignore_index=True)
In [6]:
# Names of the categorical (object-dtype) columns.
cats = [col for col in df.columns.values if df[col].dtype == 'object']
In [7]:
# Split into continuous and categorical frames. df.drop already returns a
# new frame; df[cats] does not, so take an explicit copy — the later cells
# assign into df_cat's columns, which on a slice raises
# SettingWithCopyWarning and may silently fail to propagate.
df_cont = df.drop(cats, axis=1)
df_cat = df[cats].copy()
In [8]:
# Clean the continuous features.
# NOTE(review): indentation was lost in extraction; this follows the source
# kernel, where the outlier / skew / normalisation steps apply only to
# columns that had 1-50 missing values — confirm that scope is intended.
for col in df_cont.columns.values:
    n_missing = np.sum(df_cont[col].isnull())
    if n_missing > 50:
        # Too sparse to impute reliably — drop the column entirely.
        df_cont = df_cont.drop(col, axis=1)
    elif n_missing > 0:
        median = df_cont[col].median()
        # .loc assignment replaces the original chained
        # df_cont[col].iloc[idx] = median, which triggers
        # SettingWithCopyWarning and may not write through.
        df_cont.loc[df_cont[col].isnull(), col] = median

        # Replace outliers (modified z-score > 3.5) with the median too.
        outliers = np.where(is_outlier(df_cont[col]))[0]
        df_cont.iloc[outliers, df_cont.columns.get_loc(col)] = median

        # Log-transform heavily right-skewed columns; log(0) -> -inf is
        # mapped back to 0, as in the original.
        if skew(df_cont[col]) > 0.75:
            df_cont[col] = np.log(df_cont[col])
            df_cont[col] = df_cont[col].apply(lambda x: 0 if x == -np.inf else x)

        # Series.reshape was removed in pandas; go through the ndarray.
        df_cont[col] = Normalizer().fit_transform(
            df_cont[col].values.reshape(1, -1))[0]
In [9]:
# Encode the categorical features: drop very sparse columns, tag remaining
# missing values as their own 'MIA' level, label-encode, then expand each
# column into one 0/1 indicator per level. range(max) deliberately leaves
# the highest label implicit (all-zeros row), like drop-last dummy coding.
for col in df_cat.columns.values:
    n_missing = np.sum(df_cat[col].isnull())
    if n_missing > 50:
        df_cat = df_cat.drop(col, axis=1)
        continue
    if n_missing > 0:
        df_cat[col] = df_cat[col].fillna('MIA')

    df_cat[col] = LabelEncoder().fit_transform(df_cat[col])

    for level in range(df_cat[col].max()):
        indicator = col + '_' + str(level)
        df_cat[indicator] = (df_cat[col] == level).astype(int)
    df_cat = df_cat.drop(col, axis=1)
In [10]:
# Recombine the processed frames and split back into train/test rows.
df_new = df_cont.join(df_cat)

# BUG FIX: the original sliced iloc[:len(df_train) - 1] and then
# iloc[len(df_train) + 1:], silently dropping the last training row.
# Capture the train length once and split exactly at that boundary.
n_train = len(df_train)
df_train = df_new.iloc[:n_train]
df_train = df_train.join(target_log)
df_test = df_new.iloc[n_train:]

# Column 0 is the Id (not a predictor); the last train column is the
# log target appended above.
X_train = df_train[df_train.columns.values[1:-1]]
y_train = df_train[df_train.columns.values[-1]]
X_test = df_test[df_test.columns.values[1:]]
In [11]:
# sklearn.cross_validation (imported at the top) was removed in
# scikit-learn 0.20; model_selection has had cross_val_score since 0.18.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

# greater_is_better=False (keyword, not bare positional): cross_val_score
# maximises, so the MSE comes back negated.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

clf = RandomForestRegressor(n_estimators=500, n_jobs=-1)
# Negate back and take the square root to get a per-fold RMSE.
cv_score = np.sqrt(-cross_val_score(estimator=clf, X=X_train, y=y_train,
                                    cv=15, scoring=scorer))

plt.figure(figsize=(10, 5))
plt.bar(range(len(cv_score)), cv_score)
plt.title('Cross Validation Score')
plt.ylabel('RMSE')
plt.xlabel('Iteration')
# Horizontal reference line at the mean RMSE across folds.
plt.plot(range(len(cv_score) + 1), [cv_score.mean()] * (len(cv_score) + 1))
plt.tight_layout()
In [12]:
# Fit the forest on the full training data.
clf.fit(X_train, y_train)

# Pair each feature importance with its column name, rank them, and plot
# the top 25.
importances = pd.Series(clf.feature_importances_,
                        index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
importances.head(25).plot(kind='bar')
plt.title('Feature Significance')
plt.tight_layout()
In [13]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection provides the same train_test_split.
from sklearn.model_selection import train_test_split

# Hold out a validation split (default 25%). random_state pins the split
# so the figure is reproducible under Restart & Run All.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_train, y_train, random_state=42)

clf = RandomForestRegressor(n_estimators=500, n_jobs=-1)
clf.fit(X_train1, y_train1)
y_pred = clf.predict(X_test1)

plt.figure(figsize=(10, 5))
plt.scatter(y_test1, y_pred, s=20)
plt.title('Predicted vs. Actual')
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
# y = x reference line: perfect predictions would fall on it.
plt.plot([min(y_test1), max(y_test1)], [min(y_test1), max(y_test1)])
plt.tight_layout()
In [14]:
'Kaggle > House Prices' 카테고리의 다른 글

| 글 | 댓글 | 날짜 |
|---|---|---|
| Using XGBoost For Feature Selection by Mei-Cheng Shih (With Python) | (0) | 2016.12.01 |
| A study on Regression applied to the Ames dataset by juliencs (With Python) | (0) | 2016.11.27 |
| Fun with Real Estate by Stephanie Kirmer (With R) | (0) | 2016.11.16 |
| Detailed Data Exploration in Python by Angela (With Python) | (0) | 2016.11.15 |
| Ensemble Modeling : Stack Model Example by J.Thompson (with R) | (0) | 2016.11.13 |