# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # 선형대수linear algebra
import pandas as pd # 데이터처리, CSV 파일 I/O(예를 들어 read_csv)
# Input data are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
df = pd.read_csv('../input/train.csv')
#df.drop('SalePrice', axis = 1, inplace = True)
#test = pd.read_csv('../input/test.csv')
#df = df.append(test, ignore_index = True)
df.head()
df.describe()
df.columns
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
print("Some Statistics of the Housing Price:\n")
print(df['SalePrice'].describe()) # 종속 변수 SalePrice의 기초통계량 출력.
print("\nThe median of the Housing Price is: ", df['SalePrice'].median(axis = 0))
sns.distplot(df['SalePrice'], kde = False, color = 'b', hist_kws={'alpha': 0.9})
# Id 를 제외한 수치형변수들간 상관계수
corr = df.select_dtypes(include = ['float64', 'int64']).iloc[:, 1:].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr, vmax=1, square=True)
cor_dict = corr['SalePrice'].to_dict()
del cor_dict['SalePrice']
# SalePrice와의 상관계수를 내림차순으로 정렬.
print("List the numerical features decendingly by their correlation with Sale Price:\n")
for ele in sorted(cor_dict.items(), key = lambda x: -abs(x[1])):
print("{0}: \t{1}".format(*ele))
# SalePrice에 대한 OverallQuall의 회귀선
sns.regplot(x = 'OverallQual', y = 'SalePrice', data = df, color = 'Orange')
# 높은 상관관계를 지닌 변수들의 SalePrice에 대한 산점도.
plt.figure(1)
f, axarr = plt.subplots(3, 2, figsize=(10, 9))
price = df.SalePrice.values
axarr[0, 0].scatter(df.GrLivArea.values, price)
axarr[0, 0].set_title('GrLiveArea')
axarr[0, 1].scatter(df.GarageArea.values, price)
axarr[0, 1].set_title('GarageArea')
axarr[1, 0].scatter(df.TotalBsmtSF.values, price)
axarr[1, 0].set_title('TotalBsmtSF')
axarr[1, 1].scatter(df['1stFlrSF'].values, price)
axarr[1, 1].set_title('1stFlrSF')
axarr[2, 0].scatter(df.TotRmsAbvGrd.values, price)
axarr[2, 0].set_title('TotRmsAbvGrd')
axarr[2, 1].scatter(df.MasVnrArea.values, price)
axarr[2, 1].set_title('MasVnrArea')
f.text(-0.01, 0.5, 'Sale Price', va='center', rotation='vertical', fontsize = 12)
plt.tight_layout()
plt.show()
fig = plt.figure(2, figsize=(9, 7))
plt.subplot(211)
plt.scatter(df.YearBuilt.values, price)
plt.title('YearBuilt')
plt.subplot(212)
plt.scatter(df.YearRemodAdd.values, price)
plt.title('YearRemodAdd')
fig.text(-0.01, 0.5, 'Sale Price', va = 'center', rotation = 'vertical', fontsize = 12)
plt.tight_layout()
print(df.select_dtypes(include=['object']).columns.values)
plt.figure(figsize = (12, 6))
sns.boxplot(x = 'Neighborhood', y = 'SalePrice', data = df)
xt = plt.xticks(rotation=45)
plt.figure(figsize = (12, 6))
sns.countplot(x = 'Neighborhood', data = df)
xt = plt.xticks(rotation=45)
fig, ax = plt.subplots(2, 1, figsize = (10, 6))
sns.boxplot(x = 'SaleType', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'SaleCondition', y = 'SalePrice', data = df, ax = ax[1])
plt.tight_layout()
g = sns.FacetGrid(df, col = 'YrSold', col_wrap = 3)
g.map(sns.boxplot, 'MoSold', 'SalePrice', palette='Set2', order = range(1, 13))\
.set(ylim = (0, 500000))
plt.tight_layout()
fig, ax = plt.subplots(2, 1, figsize = (10, 8))
sns.boxplot(x = 'BldgType', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'HouseStyle', y = 'SalePrice', data = df, ax = ax[1])
fig, ax = plt.subplots(2, 1, figsize = (10, 8))
sns.boxplot(x = 'Condition1', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'Exterior1st', y = 'SalePrice', data = df, ax = ax[1])
x = plt.xticks(rotation = 45)
plt.show()
fig, ax = plt.subplots(2, 2, figsize = (10, 8))
sns.boxplot('BsmtCond', 'SalePrice', data = df, ax = ax[0, 0])
sns.boxplot('BsmtQual', 'SalePrice', data = df, ax = ax[0, 1])
sns.boxplot('BsmtExposure', 'SalePrice', data = df, ax = ax[1, 0])
sns.boxplot('BsmtFinType1', 'SalePrice', data = df, ax = ax[1, 1])
sns.violinplot('Functional', 'SalePrice', data = df)
sns.factorplot('FireplaceQu', 'SalePrice', data = df, color = 'm', \
estimator = np.median, order = ['Ex', 'Gd', 'TA', 'Fa', 'Po'], size = 4.5, aspect=1.35)
pd.crosstab(df.Fireplaces, df.FireplaceQu) # 벽낸로 갯수와 질 교차표.
g = sns.FacetGrid(df, col = 'FireplaceQu', col_wrap = 3, col_order=['Ex', 'Gd', 'TA', 'Fa', 'Po'])
g.map(sns.boxplot, 'Fireplaces', 'SalePrice', order = [1, 2, 3], palette = 'Set2')
pd.crosstab(df.HeatingQC, df.CentralAir) # 중앙난방 유무와 난방의 질과 상태의 교차표
pd.crosstab(df.HeatingQC, df.FireplaceQu) # 벽난로의 질과 난방의 질과 상태의 교차표
sns.factorplot('HeatingQC', 'SalePrice', hue = 'CentralAir', estimator = np.mean, data = df,
size = 4.5, aspect = 1.4)
fig, ax = plt.subplots(1, 2, figsize = (10, 4))
sns.boxplot('Electrical', 'SalePrice', data = df, ax = ax[0]).set(ylim = (0, 400000))
sns.countplot('Electrical', data = df)
plt.tight_layout()
sns.factorplot('KitchenQual', 'SalePrice', estimator = np.mean,
size = 4.5, aspect = 1.4, data = df, order = ['Ex', 'Gd', 'TA', 'Fa'])
sns.boxplot(x = 'MSZoning', y = 'SalePrice', data = df)
fig, ax = plt.subplots(1, 2, figsize = (10, 4))
sns.boxplot(x = 'Street', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'Alley', y = 'SalePrice', data = df, ax = ax[1])
plt.tight_layout()
# Alley에서 결측값이 많다. Alley가 NA일때, Street은 'Pave'.
print("The NA's in Alley is: ", df['Alley'].isnull().sum())
print("\nThere are so many NA's in Alley. When Alley is NA, Street = ",
df[df.Alley.notnull()].Street.unique())
print("\n", pd.crosstab(df.Street, df.Alley))
'Kaggle > House Prices' 카테고리의 다른 글
RandomForestRegressor by BradenFitz-Gerald (With Python) (0) | 2016.11.17 |
---|---|
Fun with Real Estate by Stephanie Kirmer (With R) (0) | 2016.11.16 |
Ensemble Modeling : Stack Model Example by J.Thompson (with R) (0) | 2016.11.13 |
Housing Data Exploratory Analysis by AiO (With R) (0) | 2016.11.13 |
Boruta Feature Importance Analysis by Jim Thompson (With R) (0) | 2016.11.09 |