import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for plot
import plotly.express as px # for plot

import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
from sklearn.metrics import mean_squared_error

import rfpimp # for feature importance
from sklearn.tree import DecisionTreeRegressor # CART
from sklearn.ensemble import RandomForestRegressor # Random Forest

import xgboost
from xgboost import XGBRegressor # XGBoost

# set seed to make results reproducible
random_state = 101


# import Chicago Airbnb data
df = pd.read_csv('https://raw.githubusercontent.com/DanteChen0825/ChicagoAirbnb/main/data/airbnb.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5714 entries, 0 to 5713
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   host_since                   5714 non-null   int64  
 1   host_is_superhost            5714 non-null   bool   
 2   host_listings_count          5714 non-null   int64  
 3   host_identity_verified       5714 non-null   bool   
 4   neighbourhood_cleansed       5714 non-null   object 
 5   latitude                     5714 non-null   float64
 6   longitude                    5714 non-null   float64
 7   property_type                5714 non-null   object 
 8   room_type                    5714 non-null   object 
 9   accommodates                 5714 non-null   int64  
 10  bathrooms_text               5714 non-null   float64
 11  bedrooms                     5714 non-null   int64  
 12  beds                         5714 non-null   int64  
 13  price                        5714 non-null   int64  
 14  minimum_nights               5714 non-null   int64  
 15  maximum_nights               5714 non-null   int64  
 16  availability_30              5714 non-null   int64  
 17  availability_60              5714 non-null   int64  
 18  availability_90              5714 non-null   int64  
 19  availability_365             5714 non-null   int64  
 20  number_of_reviews            5714 non-null   int64  
 21  number_of_reviews_ltm        5714 non-null   int64  
 22  number_of_reviews_l30d       5714 non-null   int64  
 23  review_scores_rating         5714 non-null   float64
 24  review_scores_accuracy       5714 non-null   float64
 25  review_scores_cleanliness    5714 non-null   float64
 26  review_scores_checkin        5714 non-null   float64
 27  review_scores_communication  5714 non-null   float64
 28  review_scores_location       5714 non-null   float64
 29  review_scores_value          5714 non-null   float64
 30  instant_bookable             5714 non-null   bool   
 31  reviews_per_month            5714 non-null   float64
 32  self_check_in                5714 non-null   bool   
 33  dishwasher                   5714 non-null   bool   
 34  bathtub                      5714 non-null   bool   
 35  heating                      5714 non-null   bool   
 36  wifi                         5714 non-null   bool   
 37  parking                      5714 non-null   bool   
 38  oven                         5714 non-null   bool   
 39  stove                        5714 non-null   bool   
 40  dryer                        5714 non-null   bool   
 41  air_condition                5714 non-null   bool   
 42  outdoor_furniture            5714 non-null   bool   
 43  shampoo                      5714 non-null   bool   
 44  bbq_grill                    5714 non-null   bool   
 45  hdtv                         5714 non-null   bool   
 46  netflix                      5714 non-null   bool   
 47  microwave                    5714 non-null   bool   
 48  free_parking                 5714 non-null   bool   
 49  refrigerator                 5714 non-null   bool   
 50  alarm                        5714 non-null   bool   
 51  coffee                       5714 non-null   bool   
 52  monoxide                     5714 non-null   bool   
 53  kitchen                      5714 non-null   bool   
 54  silverware                   5714 non-null   bool   
 55  first_aid_kit                5714 non-null   bool   
dtypes: bool(27), float64(11), int64(15), object(3)
memory usage: 1.4+ MB


# Chicago Airbnb distribution map
plt.clf()
fig = px.scatter_mapbox(df,
                        lat="latitude",
                        lon="longitude",
                        hover_name="neighbourhood_cleansed", hover_data=["bedrooms", "price"],
                        color="room_type", zoom=10, height=500)
fig.update_layout(mapbox_style="carto-positron")
fig.update_traces(marker=dict(size=5,opacity = 0.6))
fig.update_layout(title = 'Figure 1: Airbnb Map of Chicago, Illinois')
fig.show()

<Figure size 640x480 with 0 Axes>


# Chicago Airbnb density map
plt.clf()
fig = px.density_mapbox(df
                        ,lat='latitude'
                        ,lon='longitude'
                        ,range_color = [0, 40]
                        ,zoom=10, height=500
                        ,radius=20
                        ,opacity=0.8
                        ,mapbox_style='carto-positron')                        
fig.update_layout(title = 'Figure 2: Airbnb Density Map of Chicago, Illinois ')
fig.show()

<Figure size 640x480 with 0 Axes>


# plot price distribution of Airbnb price of Chicago
plt.figure(dpi=500, figsize=(10,6)) 
df['price'].plot.hist(grid=False, bins=30, rwidth=0.85,color='#2874A6')
plt.title('Figure 3: Airbnb Price Distribution of Chicago, Illinois')
plt.xlabel('Airbnb Price')
plt.ylabel('Airbnb Quantity')
plt.grid(axis='y', alpha=0.4)
plt.axvline(x=df['price'].median(), color="#F7DC6F", alpha=0.95)
plt.xticks([0,100,200,300,400,500,600,700,800,900,1000])
plt.text(140, 950, r'Median = $116')
plt.annotate('[Data Source: Inside Airbnb]', (0,0), (440,-25), fontsize=8, 
             xycoords='axes fraction', textcoords='offset points', va='top')
plt.box(False)
plt.axhline(0, color='black', linewidth=1)

<matplotlib.lines.Line2D at 0x7fc24a035a30>


# Airbnb factors Correlation matrix
df_corr = df.drop(['price'], axis=1)
corr = df_corr.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

/var/folders/x5/4d1q_0v93d74d3zzcsbhcx_m0000gn/T/ipykernel_1317/1928569508.py:4: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`


# create dummy variable for the categorical value
df_1 = pd.get_dummies(df, columns=['neighbourhood_cleansed', 'property_type', 'room_type'])
df_2 = df_1.drop(["neighbourhood_cleansed_Albany Park", "property_type_Casa particular", "room_type_Entire home/apt"], axis=1)
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5714 entries, 0 to 5713
Columns: 170 entries, host_since to room_type_Shared room
dtypes: bool(27), float64(11), int64(15), uint8(117)
memory usage: 1.9 MB


# split the data into train set and test set
train_x, test_x, train_y, test_y = train_test_split(df_2.drop(['price'], axis = 1), df_2.price, random_state=random_state, test_size=0.25)

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(4285, 169)
(4285,)
(1429, 169)
(1429,)


# set hyperparameter max_depth and min_samples_split for grid search
hyperparameters = {'max_depth':[2,5,10,15,20,30,40,50], 'min_samples_split':[2,4,6,8,10,12,14,16,18,20]}

# grid searrfch with a 5-fold cross-validation
dt = DecisionTreeRegressor(random_state=random_state)
gs_cart = GridSearchCV(dt, hyperparameters, cv=5)
gs_cart.fit(train_x, train_y)

# print best hyperparameter value and corresponding accuracy score
print ("The best hyperparameter value is: ",gs_cart.best_params_)
print ("The best training score is: ", "%.4f" % gs_cart.best_score_)

# This chunk may take longer to run.
# Using Jupyter with the Anaconda environment on MacBook Pro with an M2 chip can be run in under 5 minutes.
# The expected output is following:
# The best hyperparameter value is:  {'max_depth': 5, 'min_samples_split': 16}
# The best training score is:  0.4761

The best hyperparameter value is:  {'max_depth': 5, 'min_samples_split': 16}
The best training score is:  0.4761


# use the best hyperparameter by grid search to train the CART model
dt_final = DecisionTreeRegressor(max_depth=5, min_samples_split=16, random_state=random_state)
dt_final.fit(train_x, train_y)

DecisionTreeRegressor(max_depth=5, min_samples_split=16, random_state=101)


# This chunk may take longer to run and can be skipped.
# Using Jupyter with the Anaconda environment on MacBook Pro with an M2 chip can be run in under 10 minutes.

# set hyperparameter max_depth and min_samples_split for grid search
#hyperparameters = {'max_depth':[5,10,20,30,40,50], 'min_samples_split':[2,4,6,8,10,15,20]}

# To reduce the running time of the grid search and cross-validation,
# the following alternative code that will test fewer combinations of hyperparameters, and produce the same output.
# If time is available using the original code above is highly encouraged.
# Using Jupyter with the Anaconda environment on MacBook Pro with an M2 chip can be run in under 5 minutes.

# set hyperparameter max_depth and min_samples_split for grid search
hyperparameters = {'max_depth':[20,30,40,50], 'min_samples_split':[2,4,6,8]}

# grid search with a 5-fold cross-validation
rf = RandomForestRegressor(random_state=random_state)
gs_rf = GridSearchCV(rf, hyperparameters, cv=5)
gs_rf.fit(train_x, train_y)

# print best hyperparameter value and corresponding accuracy score
print ("The best hyperparameter value is: ",gs_rf.best_params_)
print ("The best training score is: ", "%.4f" % gs_rf.best_score_)

# The expected output is following:
# The best hyperparameter value is:  {'max_depth': 40, 'min_samples_split': 2}
# The best training score is:  0.6433

The best hyperparameter value is:  {'max_depth': 40, 'min_samples_split': 2}
The best training score is:  0.6433


# use the best hyperparameter by grid search to train the Random Forest model
rf_final = RandomForestRegressor(max_depth=40, min_samples_split=2, random_state=random_state)
rf_final.fit(train_x, train_y)

RandomForestRegressor(max_depth=40, random_state=101)


# This chunk may take longer to run and can be skipped.
# Using Jupyter with the Anaconda environment on MacBook Pro with an M2 chip can be run in under 10 minutes.

# set hyperparameter max_depth and n_estimators for grid search
# hyperparameters =  {'max_depth':[5,10,20,30,40,50], 'n_estimators':[50,80,100,150,200,250,300]}

# To reduce the running time of the grid search and cross-validation,
# the following alternative code that will test fewer combinations of hyperparameters, and produce the same output.
# Using Jupyter with the Anaconda environment on MacBook Pro with an M2 chip can be run in under 5 minutes.
# If time is available using the original code above is highly encouraged.
# Using Jupyter with the Anaconda environment on MacBook Pro with an M2 chip can be run in under 5 minutes.

# set hyperparameter max_depth and n_estimators for grid search
hyperparameters =  {'max_depth':[5,10,20,30], 'n_estimators':[100,150,200,250]}

# grid search with a 5-fold cross-validation
xgb = XGBRegressor(random_state=random_state)
gscv_xgb = GridSearchCV(xgb, hyperparameters, cv=5)
gscv_xgb.fit(train_x, train_y)

# print best hyperparameter value and corresponding accuracy score
print ("The best parameter value is: ", gscv_xgb.best_params_)
print ("The best score is: ", "%.4f" % gscv_xgb.best_score_)

# The expected output is following:
# The best parameter value is:  {'max_depth': 5, 'n_estimators': 200}
# The best score is:  0.6493

The best parameter value is:  {'max_depth': 5, 'n_estimators': 200}
The best score is:  0.6493


# use the best hyperparameter by grid search to train the XGBoost model
xgb_final = XGBRegressor(max_depth=5, n_estimators=200, random_state=random_state)
xgb_final.fit(train_x, train_y)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=200, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=101, ...)


# print R2 of train and test data for three model
final_output_R2 = dict()
final_output_R2['Train'] = [dt_final.score(X=train_x, y=train_y), rf_final.score(X=train_x, y=train_y), xgb_final.score(X=train_x, y=train_y)]          
final_output_R2['Test'] = [dt_final.score(X=test_x, y=test_y), rf_final.score(X=test_x, y=test_y),xgb_final.score(X=test_x, y=test_y)]
final_output_R2 = pd.DataFrame.from_dict(final_output_R2, orient='index', columns=['CART', 'RF', 'XGB'])
final_output_R2
final_output_R2.style.set_caption("Table 2: R2 of Tree Models").format("{:.2%}")


# print the average Airbnb price
print ("The average Airbnb price of Chicago is: ", "%.1f" % df['price'].mean())

The average Airbnb price of Chicago is:  154.4


# print RMSE of train and test data for three model
final_output_RMSE = dict()
final_output_RMSE['Train'] = [mean_squared_error(train_y, dt_final.predict(train_x), squared=False),
                              mean_squared_error(train_y, rf_final.predict(train_x), squared=False), 
                              mean_squared_error(train_y, xgb_final.predict(train_x), squared=False)]          
final_output_RMSE['Test'] = [mean_squared_error(test_y, dt_final.predict(test_x), squared=False),
                             mean_squared_error(test_y, rf_final.predict(test_x), squared=False),
                             mean_squared_error(test_y, xgb_final.predict(test_x), squared=False)]
final_output_RMSE = pd.DataFrame.from_dict(final_output_RMSE, orient='index', columns=['CART', 'RF', 'XGB'])
final_output_RMSE
final_output_RMSE.style.set_caption("Table 3: RMSE of Tree Models").format("{:.2f}")


# Random Forest feature importance plot
rfpimp.plot_importances(rfpimp.importances(rf_final, test_x, test_y))

/Users/chenzhiang/opt/anaconda3/lib/python3.9/site-packages/rfpimp.py:52: UserWarning:

Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.


# XGBoost feature importance plot
rfpimp.plot_importances(rfpimp.importances(xgb_final, test_x, test_y))

/Users/chenzhiang/opt/anaconda3/lib/python3.9/site-packages/rfpimp.py:52: UserWarning:

Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations.

	host_since	host_is_superhost	host_listings_count	host_identity_verified	latitude	longitude	accommodates	bathrooms_text	bedrooms	beds	minimum_nights	maximum_nights	availability_30	availability_60	availability_90	availability_365	number_of_reviews	number_of_reviews_ltm	number_of_reviews_l30d	review_scores_rating	review_scores_accuracy	review_scores_cleanliness	review_scores_checkin	review_scores_communication	review_scores_location	review_scores_value	instant_bookable	reviews_per_month	self_check_in	dishwasher	bathtub	heating	wifi	parking	oven	stove	dryer	air_condition	outdoor_furniture	shampoo	bbq_grill	hdtv	netflix	microwave	free_parking	refrigerator	alarm	coffee	monoxide	kitchen	silverware	first_aid_kit
host_since	1.00	-0.16	-0.03	-0.04	-0.11	0.06	0.04	0.01	0.04	0.02	-0.04	-0.12	0.12	0.14	0.14	0.08	-0.21	0.01	0.16	-0.07	-0.07	-0.04	-0.09	-0.07	-0.09	-0.05	0.13	0.10	-0.05	-0.01	0.04	-0.12	-0.01	-0.14	-0.01	0.01	-0.12	0.01	-0.11	-0.12	-0.10	0.04	0.02	-0.03	-0.01	-0.03	-0.07	-0.07	-0.23	-0.04	-0.03	0.07
host_is_superhost	-0.16	1.00	-0.14	0.01	0.11	-0.09	0.09	-0.00	0.09	0.12	-0.07	-0.06	-0.03	-0.01	-0.01	0.02	0.18	0.20	0.09	0.27	0.22	0.27	0.19	0.21	0.17	0.22	-0.07	0.16	0.13	0.08	0.12	0.10	0.07	0.20	0.14	0.16	0.12	-0.04	0.19	0.11	0.08	0.22	0.14	0.18	0.03	0.14	0.06	0.20	0.16	0.01	0.21	0.09
host_listings_count	-0.03	-0.14	1.00	0.06	-0.01	0.10	-0.10	-0.04	-0.08	-0.09	0.07	0.17	-0.10	-0.06	-0.03	0.09	-0.09	-0.10	-0.08	-0.13	-0.08	-0.07	-0.12	-0.14	-0.04	-0.18	-0.07	-0.14	0.09	0.13	0.04	0.02	0.02	-0.31	0.08	-0.25	0.03	0.06	-0.09	0.07	0.08	-0.11	-0.08	0.06	-0.12	0.05	0.02	0.06	0.05	0.03	-0.36	-0.19
host_identity_verified	-0.04	0.01	0.06	1.00	0.01	0.02	0.00	0.04	-0.01	-0.01	-0.02	-0.04	0.02	0.03	0.03	0.04	0.00	0.02	0.05	-0.01	-0.00	-0.00	-0.01	0.00	-0.01	-0.01	0.02	0.05	0.01	0.06	0.05	-0.01	0.01	-0.00	0.04	0.01	0.03	0.02	0.03	0.05	-0.02	0.04	0.06	0.03	-0.00	0.01	0.00	0.02	-0.02	-0.00	-0.01	0.08
latitude	-0.11	0.11	-0.01	0.01	1.00	-0.54	-0.01	-0.06	0.01	0.00	-0.03	-0.04	-0.07	-0.09	-0.10	-0.11	0.07	0.06	0.05	0.13	0.12	0.14	0.10	0.10	0.29	0.10	-0.06	0.05	-0.08	0.11	-0.03	0.04	0.01	-0.10	0.01	-0.02	0.08	-0.03	0.05	0.09	-0.00	-0.02	-0.01	-0.02	-0.11	0.02	-0.01	0.07	-0.12	-0.01	0.04	-0.02
longitude	0.06	-0.09	0.10	0.02	-0.54	1.00	0.03	0.09	-0.02	0.01	0.02	0.07	0.04	0.04	0.04	0.07	-0.04	0.00	0.03	-0.09	-0.09	-0.10	-0.10	-0.09	-0.01	-0.10	0.10	-0.01	0.00	0.07	0.00	0.02	-0.01	-0.08	0.01	-0.02	-0.01	0.06	-0.09	-0.03	0.01	-0.00	0.00	-0.00	-0.12	-0.04	0.00	-0.05	-0.01	-0.06	-0.09	-0.02
accommodates	0.04	0.09	-0.10	0.00	-0.01	0.03	1.00	0.50	0.82	0.84	-0.09	0.02	0.10	0.09	0.07	0.12	-0.00	0.05	0.06	0.06	0.03	0.04	0.04	0.05	0.04	0.02	-0.00	0.08	0.09	0.24	0.22	0.05	0.06	0.13	0.20	0.19	0.05	0.09	0.13	0.08	0.08	0.15	0.10	0.13	0.14	0.14	0.05	0.16	0.10	0.16	0.19	0.10
bathrooms_text	0.01	-0.00	-0.04	0.04	-0.06	0.09	0.50	1.00	0.54	0.54	-0.00	0.00	0.04	0.03	0.02	0.08	-0.06	-0.04	-0.02	0.02	-0.00	-0.02	0.01	0.00	-0.00	-0.00	0.07	-0.04	0.04	0.20	0.09	0.04	0.04	0.09	0.09	0.10	0.06	0.12	0.08	-0.03	0.12	0.06	0.04	0.07	0.11	0.07	0.05	0.05	0.08	0.11	0.07	0.06
bedrooms	0.04	0.09	-0.08	-0.01	0.01	-0.02	0.82	0.54	1.00	0.81	-0.07	0.02	0.07	0.06	0.04	0.08	-0.04	0.01	0.03	0.08	0.05	0.06	0.05	0.07	0.04	0.05	-0.03	0.02	0.05	0.25	0.23	0.06	0.05	0.11	0.21	0.19	0.06	0.09	0.13	0.06	0.09	0.14	0.09	0.13	0.17	0.16	0.07	0.17	0.08	0.18	0.19	0.10
beds	0.02	0.12	-0.09	-0.01	0.00	0.01	0.84	0.54	0.81	1.00	-0.08	0.02	0.08	0.07	0.05	0.09	0.00	0.05	0.05	0.07	0.05	0.05	0.06	0.07	0.05	0.05	-0.02	0.06	0.09	0.22	0.21	0.06	0.06	0.13	0.19	0.18	0.06	0.07	0.14	0.07	0.07	0.14	0.08	0.13	0.14	0.14	0.06	0.15	0.10	0.13	0.18	0.11
minimum_nights	-0.04	-0.07	0.07	-0.02	-0.03	0.02	-0.09	-0.00	-0.07	-0.08	1.00	0.11	-0.05	-0.06	-0.06	-0.04	-0.08	-0.13	-0.13	-0.01	-0.02	-0.02	-0.00	-0.02	-0.05	-0.03	0.01	-0.16	-0.08	-0.01	-0.08	0.02	-0.00	-0.04	-0.03	-0.05	0.01	0.02	-0.05	-0.01	0.00	-0.07	-0.07	-0.05	-0.00	-0.04	-0.00	-0.06	0.03	0.04	-0.08	-0.03
maximum_nights	-0.12	-0.06	0.17	-0.04	-0.04	0.07	0.02	0.00	0.02	0.02	0.11	1.00	-0.08	-0.08	-0.07	0.08	0.04	-0.02	-0.09	-0.07	-0.05	-0.06	-0.03	-0.05	-0.01	-0.07	-0.02	-0.09	0.08	0.03	-0.06	0.09	0.04	0.02	0.01	-0.05	0.07	0.07	-0.01	0.06	0.01	-0.06	-0.02	0.01	0.03	0.02	0.04	-0.00	0.10	0.05	-0.08	-0.10
availability_30	0.12	-0.03	-0.10	0.02	-0.07	0.04	0.10	0.04	0.07	0.08	-0.05	-0.08	1.00	0.93	0.87	0.49	-0.02	0.06	0.04	-0.05	-0.06	-0.04	-0.05	-0.04	-0.09	-0.09	0.08	0.04	-0.02	-0.02	0.00	-0.04	-0.04	-0.02	-0.04	-0.01	-0.03	-0.01	-0.00	-0.01	-0.02	0.05	0.05	-0.04	-0.00	-0.04	-0.02	-0.05	-0.04	-0.07	0.00	0.04
availability_60	0.14	-0.01	-0.06	0.03	-0.09	0.04	0.09	0.03	0.06	0.07	-0.06	-0.08	0.93	1.00	0.97	0.57	-0.03	0.07	0.07	-0.05	-0.06	-0.04	-0.06	-0.04	-0.11	-0.09	0.07	0.06	-0.01	-0.01	0.02	-0.02	-0.04	-0.03	-0.01	0.00	-0.02	-0.02	-0.00	-0.00	-0.01	0.06	0.06	-0.01	0.01	-0.01	0.00	-0.02	-0.04	-0.06	0.02	0.04
availability_90	0.14	-0.01	-0.03	0.03	-0.10	0.04	0.07	0.02	0.04	0.05	-0.06	-0.07	0.87	0.97	1.00	0.61	-0.04	0.07	0.07	-0.06	-0.06	-0.05	-0.07	-0.05	-0.12	-0.10	0.05	0.05	-0.02	-0.01	0.02	-0.01	-0.05	-0.04	0.00	-0.00	-0.01	-0.04	-0.02	0.00	-0.01	0.06	0.06	-0.00	0.00	0.01	0.01	-0.01	-0.04	-0.05	0.03	0.04
availability_365	0.08	0.02	0.09	0.04	-0.11	0.07	0.12	0.08	0.08	0.09	-0.04	0.08	0.49	0.57	0.61	1.00	-0.03	0.04	0.03	-0.06	-0.06	-0.05	-0.06	-0.06	-0.07	-0.11	0.09	0.02	0.04	0.04	0.03	0.00	0.02	0.02	0.05	-0.02	0.03	0.00	0.01	0.01	0.02	0.10	0.08	0.04	0.04	0.07	0.02	0.05	0.07	0.02	0.04	0.00
number_of_reviews	-0.21	0.18	-0.09	0.00	0.07	-0.04	-0.00	-0.06	-0.04	0.00	-0.08	0.04	-0.02	-0.03	-0.04	-0.03	1.00	0.71	0.36	0.07	0.08	0.07	0.09	0.08	0.08	0.10	-0.03	0.68	0.08	-0.00	-0.02	0.09	0.04	0.11	0.05	0.05	0.08	-0.01	0.07	0.14	-0.03	0.00	0.04	0.10	0.02	0.05	0.05	0.10	0.08	-0.08	0.09	0.02
number_of_reviews_ltm	0.01	0.20	-0.10	0.02	0.06	0.00	0.05	-0.04	0.01	0.05	-0.13	-0.02	0.06	0.07	0.07	0.04	0.71	1.00	0.65	0.06	0.07	0.07	0.06	0.06	0.09	0.08	0.03	0.85	0.11	0.03	0.04	0.06	0.05	0.04	0.05	0.06	0.06	-0.01	0.08	0.09	-0.06	0.08	0.09	0.09	-0.01	0.04	0.02	0.09	-0.01	-0.09	0.08	0.05
number_of_reviews_l30d	0.16	0.09	-0.08	0.05	0.05	0.03	0.06	-0.02	0.03	0.05	-0.13	-0.09	0.04	0.07	0.07	0.03	0.36	0.65	1.00	0.07	0.06	0.07	0.03	0.05	0.09	0.08	0.08	0.75	0.05	0.07	0.08	0.04	0.01	-0.03	0.08	0.09	0.04	-0.02	0.02	0.08	-0.08	0.10	0.11	0.08	-0.05	0.05	0.02	0.11	-0.10	-0.06	0.09	0.10
review_scores_rating	-0.07	0.27	-0.13	-0.01	0.13	-0.09	0.06	0.02	0.08	0.07	-0.01	-0.07	-0.05	-0.05	-0.06	-0.06	0.07	0.06	0.07	1.00	0.85	0.79	0.69	0.80	0.60	0.82	-0.07	0.08	0.04	0.10	0.07	0.08	0.06	0.07	0.09	0.12	0.12	0.01	0.12	0.07	0.07	0.11	0.07	0.09	0.02	0.11	0.07	0.16	0.01	0.02	0.18	0.10
review_scores_accuracy	-0.07	0.22	-0.08	-0.00	0.12	-0.09	0.03	-0.00	0.05	0.05	-0.02	-0.05	-0.06	-0.06	-0.06	-0.06	0.08	0.07	0.06	0.85	1.00	0.75	0.70	0.74	0.56	0.79	-0.06	0.09	0.06	0.10	0.05	0.08	0.07	0.04	0.08	0.10	0.11	0.02	0.09	0.06	0.06	0.11	0.07	0.09	0.01	0.10	0.06	0.15	0.02	0.02	0.14	0.07
review_scores_cleanliness	-0.04	0.27	-0.07	-0.00	0.14	-0.10	0.04	-0.02	0.06	0.05	-0.02	-0.06	-0.04	-0.04	-0.05	-0.05	0.07	0.07	0.07	0.79	0.75	1.00	0.59	0.63	0.47	0.68	-0.07	0.09	0.04	0.10	0.06	0.08	0.08	0.04	0.10	0.10	0.12	0.01	0.11	0.09	0.06	0.12	0.07	0.10	-0.00	0.10	0.08	0.16	-0.00	0.01	0.15	0.09
review_scores_checkin	-0.09	0.19	-0.12	-0.01	0.10	-0.10	0.04	0.01	0.05	0.06	-0.00	-0.03	-0.05	-0.06	-0.07	-0.06	0.09	0.06	0.03	0.69	0.70	0.59	1.00	0.72	0.50	0.62	-0.06	0.06	0.06	0.03	0.03	0.05	0.02	0.08	0.04	0.07	0.08	0.00	0.09	0.04	0.04	0.07	0.04	0.05	0.03	0.06	0.04	0.10	0.04	0.00	0.13	0.04
review_scores_communication	-0.07	0.21	-0.14	0.00	0.10	-0.09	0.05	0.00	0.07	0.07	-0.02	-0.05	-0.04	-0.04	-0.05	-0.06	0.08	0.06	0.05	0.80	0.74	0.63	0.72	1.00	0.50	0.70	-0.05	0.08	0.05	0.06	0.07	0.06	0.03	0.07	0.09	0.12	0.09	-0.01	0.09	0.07	0.02	0.09	0.07	0.09	0.04	0.10	0.06	0.14	0.01	0.03	0.18	0.10
review_scores_location	-0.09	0.17	-0.04	-0.01	0.29	-0.01	0.04	-0.00	0.04	0.05	-0.05	-0.01	-0.09	-0.11	-0.12	-0.07	0.08	0.09	0.09	0.60	0.56	0.47	0.50	0.50	1.00	0.58	-0.02	0.09	-0.00	0.13	0.01	0.07	0.02	-0.03	0.05	0.04	0.10	0.03	0.09	0.06	0.06	0.07	0.05	0.04	-0.08	0.05	0.07	0.13	-0.03	-0.04	0.09	0.04
review_scores_value	-0.05	0.22	-0.18	-0.01	0.10	-0.10	0.02	-0.00	0.05	0.05	-0.03	-0.07	-0.09	-0.09	-0.10	-0.11	0.10	0.08	0.08	0.82	0.79	0.68	0.62	0.70	0.58	1.00	-0.04	0.12	0.02	0.06	0.04	0.05	0.05	0.07	0.06	0.11	0.08	-0.00	0.10	0.04	0.03	0.09	0.07	0.07	0.03	0.07	0.06	0.12	0.01	0.00	0.17	0.10
instant_bookable	0.13	-0.07	-0.07	0.02	-0.06	0.10	-0.00	0.07	-0.03	-0.02	0.01	-0.02	0.08	0.07	0.05	0.09	-0.03	0.03	0.08	-0.07	-0.06	-0.07	-0.06	-0.05	-0.02	-0.04	1.00	0.08	0.07	-0.07	-0.06	-0.09	-0.01	0.02	-0.08	-0.03	-0.06	0.04	-0.06	-0.09	-0.06	-0.01	-0.00	-0.08	-0.02	-0.07	-0.05	-0.09	0.04	-0.09	-0.06	-0.00
reviews_per_month	0.10	0.16	-0.14	0.05	0.05	-0.01	0.08	-0.04	0.02	0.06	-0.16	-0.09	0.04	0.06	0.05	0.02	0.68	0.85	0.75	0.08	0.09	0.09	0.06	0.08	0.09	0.12	0.08	1.00	0.11	0.05	0.08	0.06	0.04	0.05	0.09	0.12	0.05	-0.02	0.06	0.12	-0.09	0.11	0.12	0.13	-0.02	0.08	0.03	0.13	-0.03	-0.08	0.14	0.10
self_check_in	-0.05	0.13	0.09	0.01	-0.08	0.00	0.09	0.04	0.05	0.09	-0.08	0.08	-0.02	-0.01	-0.02	0.04	0.08	0.11	0.05	0.04	0.06	0.04	0.06	0.05	-0.00	0.02	0.07	0.11	1.00	0.08	0.09	0.10	0.10	0.21	0.12	0.16	0.12	0.06	0.14	0.07	0.06	0.16	0.11	0.17	0.06	0.15	0.08	0.19	0.26	0.03	0.11	0.05
dishwasher	-0.01	0.08	0.13	0.06	0.11	0.07	0.24	0.20	0.25	0.22	-0.01	0.03	-0.02	-0.01	-0.01	0.04	-0.00	0.03	0.07	0.10	0.10	0.10	0.03	0.06	0.13	0.06	-0.07	0.05	0.08	1.00	0.24	0.20	0.05	0.05	0.54	0.43	0.20	0.13	0.13	0.18	0.12	0.15	0.14	0.46	-0.02	0.41	0.10	0.38	-0.01	0.24	0.31	0.11
bathtub	0.04	0.12	0.04	0.05	-0.03	0.00	0.22	0.09	0.23	0.21	-0.08	-0.06	0.00	0.02	0.02	0.03	-0.02	0.04	0.08	0.07	0.05	0.06	0.03	0.07	0.01	0.04	-0.06	0.08	0.09	0.24	1.00	0.13	0.05	0.12	0.33	0.28	0.10	-0.04	0.14	0.14	0.07	0.22	0.20	0.28	0.05	0.26	0.07	0.22	0.08	0.13	0.24	0.11
heating	-0.12	0.10	0.02	-0.01	0.04	0.02	0.05	0.04	0.06	0.06	0.02	0.09	-0.04	-0.02	-0.01	0.00	0.09	0.06	0.04	0.08	0.08	0.08	0.05	0.06	0.07	0.05	-0.09	0.06	0.10	0.20	0.13	1.00	0.08	0.13	0.28	0.26	0.48	0.04	0.07	0.32	0.02	0.10	0.08	0.32	-0.00	0.34	0.20	0.32	0.10	0.05	0.31	0.05
wifi	-0.01	0.07	0.02	0.01	0.01	-0.01	0.06	0.04	0.05	0.06	-0.00	0.04	-0.04	-0.04	-0.05	0.02	0.04	0.05	0.01	0.06	0.07	0.08	0.02	0.03	0.02	0.05	-0.01	0.04	0.10	0.05	0.05	0.08	1.00	0.08	0.06	0.04	0.09	0.13	0.05	0.04	0.04	0.07	0.05	0.09	0.05	0.05	0.18	0.09	0.15	0.08	0.04	0.04
parking	-0.14	0.20	-0.31	-0.00	-0.10	-0.08	0.13	0.09	0.11	0.13	-0.04	0.02	-0.02	-0.03	-0.04	0.02	0.11	0.04	-0.03	0.07	0.04	0.04	0.08	0.07	-0.03	0.07	0.02	0.05	0.21	0.05	0.12	0.13	0.08	1.00	0.15	0.26	0.10	-0.00	0.18	0.03	0.10	0.19	0.14	0.23	0.35	0.19	0.11	0.18	0.34	0.05	0.32	0.13
oven	-0.01	0.14	0.08	0.04	0.01	0.01	0.20	0.09	0.21	0.19	-0.03	0.01	-0.04	-0.01	0.00	0.05	0.05	0.05	0.08	0.09	0.08	0.10	0.04	0.09	0.05	0.06	-0.08	0.09	0.12	0.54	0.33	0.28	0.06	0.15	1.00	0.77	0.23	-0.02	0.13	0.24	0.13	0.19	0.15	0.67	-0.00	0.64	0.15	0.52	0.04	0.40	0.54	0.12
stove	0.01	0.16	-0.25	0.01	-0.02	-0.02	0.19	0.10	0.19	0.18	-0.05	-0.05	-0.01	0.00	-0.00	-0.02	0.05	0.06	0.09	0.12	0.10	0.10	0.07	0.12	0.04	0.11	-0.03	0.12	0.16	0.43	0.28	0.26	0.04	0.26	0.77	1.00	0.21	-0.03	0.17	0.18	0.12	0.21	0.15	0.60	0.01	0.58	0.13	0.47	0.02	0.38	0.63	0.24
dryer	-0.12	0.12	0.03	0.03	0.08	-0.01	0.05	0.06	0.06	0.06	0.01	0.07	-0.03	-0.02	-0.01	0.03	0.08	0.06	0.04	0.12	0.11	0.12	0.08	0.09	0.10	0.08	-0.06	0.05	0.12	0.20	0.10	0.48	0.09	0.10	0.23	0.21	1.00	0.06	0.08	0.33	0.04	0.11	0.08	0.28	-0.01	0.25	0.20	0.29	0.09	0.06	0.23	0.10
air_condition	0.01	-0.04	0.06	0.02	-0.03	0.06	0.09	0.12	0.09	0.07	0.02	0.07	-0.01	-0.02	-0.04	0.00	-0.01	-0.01	-0.02	0.01	0.02	0.01	0.00	-0.01	0.03	-0.00	0.04	-0.02	0.06	0.13	-0.04	0.04	0.13	-0.00	-0.02	-0.03	0.06	1.00	-0.02	0.00	0.03	-0.08	-0.04	0.01	0.01	-0.02	0.06	-0.01	0.09	0.01	-0.04	-0.01
outdoor_furniture	-0.11	0.19	-0.09	0.03	0.05	-0.09	0.13	0.08	0.13	0.14	-0.05	-0.01	-0.00	-0.00	-0.02	0.01	0.07	0.08	0.02	0.12	0.09	0.11	0.09	0.09	0.09	0.10	-0.06	0.06	0.14	0.13	0.14	0.07	0.05	0.18	0.13	0.17	0.08	-0.02	1.00	0.05	0.41	0.25	0.19	0.14	0.06	0.12	0.06	0.16	0.10	0.02	0.17	0.13
shampoo	-0.12	0.11	0.07	0.05	0.09	-0.03	0.08	-0.03	0.06	0.07	-0.01	0.06	-0.01	-0.00	0.00	0.01	0.14	0.09	0.08	0.07	0.06	0.09	0.04	0.07	0.06	0.04	-0.09	0.12	0.07	0.18	0.14	0.32	0.04	0.03	0.24	0.18	0.33	0.00	0.05	1.00	-0.03	0.09	0.11	0.27	-0.07	0.18	0.14	0.27	0.07	0.03	0.17	0.14
bbq_grill	-0.10	0.08	0.08	-0.02	-0.00	0.01	0.08	0.12	0.09	0.07	0.00	0.01	-0.02	-0.01	-0.01	0.02	-0.03	-0.06	-0.08	0.07	0.06	0.06	0.04	0.02	0.06	0.03	-0.06	-0.09	0.06	0.12	0.07	0.02	0.04	0.10	0.13	0.12	0.04	0.03	0.41	-0.03	1.00	0.10	0.06	0.12	0.04	0.14	0.05	0.14	0.08	0.09	0.11	0.05
hdtv	0.04	0.22	-0.11	0.04	-0.02	-0.00	0.15	0.06	0.14	0.14	-0.07	-0.06	0.05	0.06	0.06	0.10	0.00	0.08	0.10	0.11	0.11	0.12	0.07	0.09	0.07	0.09	-0.01	0.11	0.16	0.15	0.22	0.10	0.07	0.19	0.19	0.21	0.11	-0.08	0.25	0.09	0.10	1.00	0.62	0.20	0.04	0.22	0.07	0.23	0.12	0.08	0.26	0.16
netflix	0.02	0.14	-0.08	0.06	-0.01	0.00	0.10	0.04	0.09	0.08	-0.07	-0.02	0.05	0.06	0.06	0.08	0.04	0.09	0.11	0.07	0.07	0.07	0.04	0.07	0.05	0.07	-0.00	0.12	0.11	0.14	0.20	0.08	0.05	0.14	0.15	0.15	0.08	-0.04	0.19	0.11	0.06	0.62	1.00	0.18	0.04	0.17	0.05	0.18	0.09	0.06	0.20	0.14
microwave	-0.03	0.18	0.06	0.03	-0.02	-0.00	0.13	0.07	0.13	0.13	-0.05	0.01	-0.04	-0.01	-0.00	0.04	0.10	0.09	0.08	0.09	0.09	0.10	0.05	0.09	0.04	0.07	-0.08	0.13	0.17	0.46	0.28	0.32	0.09	0.23	0.67	0.60	0.28	0.01	0.14	0.27	0.12	0.20	0.18	1.00	-0.00	0.64	0.16	0.58	0.10	0.19	0.55	0.13
free_parking	-0.01	0.03	-0.12	-0.00	-0.11	-0.12	0.14	0.11	0.17	0.14	-0.00	0.03	-0.00	0.01	0.00	0.04	0.02	-0.01	-0.05	0.02	0.01	-0.00	0.03	0.04	-0.08	0.03	-0.02	-0.02	0.06	-0.02	0.05	-0.00	0.05	0.35	-0.00	0.01	-0.01	0.01	0.06	-0.07	0.04	0.04	0.04	-0.00	1.00	0.02	0.04	0.00	0.14	0.08	0.09	0.05
refrigerator	-0.03	0.14	0.05	0.01	0.02	-0.04	0.14	0.07	0.16	0.14	-0.04	0.02	-0.04	-0.01	0.01	0.07	0.05	0.04	0.05	0.11	0.10	0.10	0.06	0.10	0.05	0.07	-0.07	0.08	0.15	0.41	0.26	0.34	0.05	0.19	0.64	0.58	0.25	-0.02	0.12	0.18	0.14	0.22	0.17	0.64	0.02	1.00	0.18	0.64	0.08	0.31	0.64	0.11
alarm	-0.07	0.06	0.02	0.00	-0.01	0.00	0.05	0.05	0.07	0.06	-0.00	0.04	-0.02	0.00	0.01	0.02	0.05	0.02	0.02	0.07	0.06	0.08	0.04	0.06	0.07	0.06	-0.05	0.03	0.08	0.10	0.07	0.20	0.18	0.11	0.15	0.13	0.20	0.06	0.06	0.14	0.05	0.07	0.05	0.16	0.04	0.18	1.00	0.17	0.39	0.12	0.17	0.15
coffee	-0.07	0.20	0.06	0.02	0.07	-0.05	0.16	0.05	0.17	0.15	-0.06	-0.00	-0.05	-0.02	-0.01	0.05	0.10	0.09	0.11	0.16	0.15	0.16	0.10	0.14	0.13	0.12	-0.09	0.13	0.19	0.38	0.22	0.32	0.09	0.18	0.52	0.47	0.29	-0.01	0.16	0.27	0.14	0.23	0.18	0.58	0.00	0.64	0.17	1.00	0.09	0.18	0.57	0.13
monoxide	-0.23	0.16	0.05	-0.02	-0.12	-0.01	0.10	0.08	0.08	0.10	0.03	0.10	-0.04	-0.04	-0.04	0.07	0.08	-0.01	-0.10	0.01	0.02	-0.00	0.04	0.01	-0.03	0.01	0.04	-0.03	0.26	-0.01	0.08	0.10	0.15	0.34	0.04	0.02	0.09	0.09	0.10	0.07	0.08	0.12	0.09	0.10	0.14	0.08	0.39	0.09	1.00	0.06	0.07	0.04
kitchen	-0.04	0.01	0.03	-0.00	-0.01	-0.06	0.16	0.11	0.18	0.13	0.04	0.05	-0.07	-0.06	-0.05	0.02	-0.08	-0.09	-0.06	0.02	0.02	0.01	0.00	0.03	-0.04	0.00	-0.09	-0.08	0.03	0.24	0.13	0.05	0.08	0.05	0.40	0.38	0.06	0.01	0.02	0.03	0.09	0.08	0.06	0.19	0.08	0.31	0.12	0.18	0.06	1.00	0.34	0.03
silverware	-0.03	0.21	-0.36	-0.01	0.04	-0.09	0.19	0.07	0.19	0.18	-0.08	-0.08	0.00	0.02	0.03	0.04	0.09	0.08	0.09	0.18	0.14	0.15	0.13	0.18	0.09	0.17	-0.06	0.14	0.11	0.31	0.24	0.31	0.04	0.32	0.54	0.63	0.23	-0.04	0.17	0.17	0.11	0.26	0.20	0.55	0.09	0.64	0.17	0.57	0.07	0.34	1.00	0.19
first_aid_kit	0.07	0.09	-0.19	0.08	-0.02	-0.02	0.10	0.06	0.10	0.11	-0.03	-0.10	0.04	0.04	0.04	0.00	0.02	0.05	0.10	0.10	0.07	0.09	0.04	0.10	0.04	0.10	-0.00	0.10	0.05	0.11	0.11	0.05	0.04	0.13	0.12	0.24	0.10	-0.01	0.13	0.14	0.05	0.16	0.14	0.13	0.05	0.11	0.15	0.13	0.04	0.03	0.19	1.00

	CART	RF	XGB
Train	60.70%	95.24%	98.77%
Test	56.64%	69.89%	71.18%

	CART	RF	XGB
Train	80.16	27.89	14.16
Test	88.06	73.37	71.79

Airbnb Price Prediction Based on Tree Model and Analysis of Influencing Factors, Evidence from Chicago

Introduction¶

Literature Review¶

Research Question¶

Presentation of Data¶

Table 1: Correlation matrix of Airbnb factors¶

Methodology¶

Classification And Regression Tree¶

Random Forest¶

XGBoost¶

Feature Importance Analysis¶

Results and Discussion¶

Figure 4: Airbnb Feature Importance Plot of Random Forest Model¶

Figure 5: Airbnb Feature Importance Plot of XGBoost Model¶

Conclusion¶

Reference¶