import numpy as np
import scipy as sp
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from pandas.core.common import random_state
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
import pickle
the_path = "C:/Users/rsb84/Desktop/RB/portfolio_data/nigeria_news/"
export_path = "C:/Users/rsb84/Desktop/RB/portfolio_data/nigeria_news/export/"
df = pd.read_csv('Nigeria2019_ACLED_Extract.csv')
event_types = df[['week', 'event_type', 'sub_event_type']].copy()
event_types.head()
week | event_type | sub_event_type | |
---|---|---|---|
0 | 1 | Battles | Armed clash |
1 | 1 | Battles | Armed clash |
2 | 1 | Explosions/Remote violence | Air/drone strike |
3 | 1 | Violence against civilians | Attack |
4 | 1 | Battles | Armed clash |
# Drop non-violent/administrative records in one pass: strategic developments
# and the two protest sub-types that are excluded from the targets.
excluded_sub_events = ['Peaceful protest', 'Protest with intervention']
drop_mask = (
    (event_types['event_type'] == 'Strategic developments')
    | event_types['sub_event_type'].isin(excluded_sub_events)
)
event_types = event_types.loc[~drop_mask, :]
# One indicator column per remaining event type.
event_types_dummies = pd.get_dummies(event_types[['event_type']])
event_types_dummies.head()
event_type_Battles | event_type_Explosions/Remote violence | event_type_Protests | event_type_Riots | event_type_Violence against civilians | |
---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 0 |
2 | 0 | 1 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 1 |
4 | 1 | 0 | 0 | 0 | 0 |
event_types_dummies.shape
(1492, 5)
event_types.shape
(1492, 3)
event_types_combined = pd.concat([event_types[['week']], event_types_dummies], axis=1)
event_types_combined.shape
(1492, 6)
event_types_combined.head()
week | event_type_Battles | event_type_Explosions/Remote violence | event_type_Protests | event_type_Riots | event_type_Violence against civilians | |
---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 0 | 0 |
1 | 1 | 1 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 1 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 0 | 0 | 1 |
4 | 1 | 1 | 0 | 0 | 0 | 0 |
event_types_by_week = event_types_combined.groupby(['week']).sum()
event_types_by_week.shape
(53, 5)
event_types_by_week.head()
event_type_Battles | event_type_Explosions/Remote violence | event_type_Protests | event_type_Riots | event_type_Violence against civilians | |
---|---|---|---|---|---|
week | |||||
1 | 18 | 4 | 0 | 1 | 5 |
2 | 10 | 0 | 0 | 1 | 6 |
3 | 11 | 0 | 0 | 7 | 11 |
4 | 17 | 1 | 1 | 6 | 6 |
5 | 10 | 3 | 0 | 5 | 13 |
event_types_by_week = event_types_by_week.reset_index()
event_types_by_week = pd.DataFrame(event_types_by_week)
newsfeed = pd.read_csv("newsfeed.csv")
newsfeed.columns
Index(['week', 'text'], dtype='object')
newsfeed = newsfeed.sort_values(by="week")
newsfeed.shape
(24355, 2)
newsfeed_by_week = newsfeed.groupby('week')['text'].apply(lambda x:x.str.cat(sep=" "))
newsfeed_by_week = pd.DataFrame(newsfeed_by_week)
newsfeed_by_week = newsfeed_by_week.reset_index()
newsfeed_by_week['week'].values
array([ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53], dtype=int64)
event_types_by_week['week'].values
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53], dtype=int64)
event_types_by_week_50_wks = event_types_by_week.iloc[3: , :] #This removes the first 3 rows (weeks 1 - 3) from the dataframe
event_types_by_week_50_wks.head()
week | event_type_Battles | event_type_Explosions/Remote violence | event_type_Protests | event_type_Riots | event_type_Violence against civilians | |
---|---|---|---|---|---|---|
3 | 4 | 17 | 1 | 1 | 6 | 6 |
4 | 5 | 10 | 3 | 0 | 5 | 13 |
5 | 6 | 15 | 1 | 0 | 10 | 24 |
6 | 7 | 13 | 2 | 0 | 4 | 49 |
7 | 8 | 21 | 2 | 0 | 4 | 27 |
event_types_by_week_50_wks.shape
(50, 6)
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\rsb84\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
def rem_sw(var, extra_sw=None, base_sw=None):
    """Remove stopwords from a whitespace-delimited string.

    Parameters
    ----------
    var : str
        Text to clean. Matching is case-sensitive, so callers should
        lower-case the text first if the stopword list is lower-case.
    extra_sw : iterable of str, optional
        Additional stopwords merged into the base list. Defaults to the
        project's news-specific extras ("said", "says", ...).
    base_sw : iterable of str, optional
        Base stopword list. Defaults to NLTK's English stopwords, which
        requires the NLTK "stopwords" corpus to be downloaded.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if base_sw is None:
        # Imported lazily so NLTK is only needed when its list is used.
        from nltk.corpus import stopwords
        base_sw = stopwords.words('english')
    if extra_sw is None:
        extra_sw = {"said", "says", "saying", "say", "us", "since", "like", "likes", "people"}
    sw = set(base_sw).union(extra_sw)
    tokens = var.split()  # tokenizing words on whitespace
    kept = [word for word in tokens if word not in sw]  # removing stopwords
    return ' '.join(kept)  # rejoining the remaining tokens
newsfeed_by_week['text_lower'] = newsfeed_by_week.text.apply(lambda x: x.lower())
newsfeed_by_week['text_sw'] = newsfeed_by_week.text_lower.apply(rem_sw)
the_column1 = 'text_sw'
def my_tf_idf_sw(var, path_in):
    """Fit a TF-IDF vectorizer on ``var`` and persist the results.

    Parameters
    ----------
    var : iterable of str
        Documents to vectorize (one string per document).
    path_in : str
        Output directory prefix. It is concatenated directly with the file
        names, so it must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF matrix with one row per document and one column per
        vocabulary term.

    Side effects
    ------------
    Writes the fitted vectorizer to ``path_in + "tf_idf_sw.pkl"`` and the
    dense matrix to ``path_in + "tf_idf_df_sw.pkl"``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    my_tf_idf = TfidfVectorizer()
    my_tf_idf_vec = pd.DataFrame(my_tf_idf.fit_transform(var).toarray())
    my_tf_idf_vec.columns = my_tf_idf.get_feature_names_out()
    # Context managers guarantee the pickle files are flushed and closed.
    with open(path_in + "tf_idf_sw.pkl", "wb") as vectorizer_file:
        pickle.dump(my_tf_idf, vectorizer_file)
    with open(path_in + "tf_idf_df_sw.pkl", "wb") as matrix_file:
        pickle.dump(my_tf_idf_vec, matrix_file)
    return my_tf_idf_vec
news_tf_idf_sw = my_tf_idf_sw(newsfeed_by_week[the_column1], export_path)
news_tf_idf_sw_fd = news_tf_idf_sw.diff(periods=1)
news_tf_idf_sw_fd.shape
(52, 36829)
news_tf_idf_sw_fd = news_tf_idf_sw_fd.dropna(axis=0)
news_tf_idf_sw_fd.shape #Now week 2 has been deleted by dropping the first row of missing values left by first differencing
(51, 36829)
'''
pickle.dump(news_tf_idf_sw_fd, open(export_path + "news_tf_idf_sw_fd.pkl", "wb"))
'''
news_tf_idf_sw_fd = pickle.load(open(export_path + "news_tf_idf_sw_fd.pkl", "rb"))
#Here, I create the target variables I will use my newsfeed datasets to attempt to predict using supervised machine learning models
y_battles = event_types_by_week_50_wks[['event_type_Battles']].copy()
y_explosions = event_types_by_week_50_wks[['event_type_Explosions/Remote violence']].copy()
y_protests = event_types_by_week_50_wks[['event_type_Protests']].copy()
y_riots = event_types_by_week_50_wks[['event_type_Riots']].copy()
y_vac = event_types_by_week_50_wks[['event_type_Violence against civilians']].copy()
print(y_battles.shape)
print(y_explosions.shape)
print(y_protests.shape)
print(y_riots.shape)
print(y_vac.shape)
(50, 1) (50, 1) (50, 1) (50, 1) (50, 1)
'''
pickle.dump(y_battles, open(export_path + "y_battles_50_wks.pkl", "wb"))
pickle.dump(y_explosions, open(export_path + "y_explosions_50_wks.pkl", "wb"))
pickle.dump(y_protests, open(export_path + "y_protests_50_wks.pkl", "wb"))
pickle.dump(y_riots, open(export_path + "y_riots_50_wks.pkl", "wb"))
pickle.dump(y_vac, open(export_path + "y_vac_50_wks.pkl", "wb"))
'''
# Reload the five 50-week targets from disk. The battles target is bound to
# `y_battles` (the name every later cell uses), and each file is opened in a
# context manager so its handle is closed after reading.
with open(export_path + "y_battles_50_wks.pkl", "rb") as f:
    y_battles = pickle.load(f)
with open(export_path + "y_explosions_50_wks.pkl", "rb") as f:
    y_explosions = pickle.load(f)
with open(export_path + "y_protests_50_wks.pkl", "rb") as f:
    y_protests = pickle.load(f)
with open(export_path + "y_riots_50_wks.pkl", "rb") as f:
    y_riots = pickle.load(f)
with open(export_path + "y_vac_50_wks.pkl", "rb") as f:
    y_vac = pickle.load(f)
from statsmodels.tsa.stattools import adfuller
adf_test = adfuller(y_battles)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1]))
ADF = -8.038047205293735 p-value = 1.87939942870067e-12
adf_test = adfuller(y_explosions)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1]))
ADF = -7.011085577942736 p-value = 6.919047048561405e-10
# Augmented Dickey-Fuller test on the weekly protest counts.
adf_test = adfuller(y_protests)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1])) #We cannot reject the null hypothesis of a unit root (non-stationarity) since the p-value > 0.05.
ADF = -2.166443849384345 p-value = 0.21864897167202985
adf_test = adfuller(y_riots)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1]))
ADF = -5.660105937292464 p-value = 9.422640208691123e-07
# Augmented Dickey-Fuller test on the weekly violence-against-civilians counts.
adf_test = adfuller(y_vac)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1])) #We cannot reject the null hypothesis of a unit root (non-stationarity) since the p-value > 0.05.
ADF = -1.6687567391397395 p-value = 0.4472775450272684
#We need to first difference y_protests and y_vac to make them stationary. First differencing drops the
#first row, so to keep 50 weeks of data I start one week earlier (week 3 instead of week 4) before
#differencing. The differenced series then covers weeks 4 - 53, the same 50 weeks as the other targets.
y_protests_orig = event_types_by_week[['event_type_Protests']].copy()
y_vac_orig = event_types_by_week[['event_type_Violence against civilians']].copy()
print(y_protests_orig.shape)
print(y_vac_orig.shape)
(53, 1) (53, 1)
y_protests = y_protests_orig.iloc[2: , :]
y_vac = y_vac_orig.iloc[2: , :]
y_protests = y_protests.diff(periods=1).dropna(axis=0)
y_vac = y_vac.diff(periods=1).dropna(axis=0)
print(y_protests.shape)
print(y_vac.shape)
(50, 1) (50, 1)
'''
pickle.dump(y_protests, open(export_path + "y_protests_diff.pkl", "wb"))
pickle.dump(y_vac, open(export_path + "y_vac_diff.pkl", "wb"))
'''
y_protests = pickle.load(open(export_path + "y_protests_diff.pkl", "rb"))
y_vac = pickle.load(open(export_path + "y_vac_diff.pkl", "rb"))
#The following code for building a function that creates lagged features and includes them in the same dataset
#as the non-lagged features originates from the following website:
#https://stackoverflow.com/questions/20410312/how-to-create-a-lagged-data-structure-using-pandas-dataframe
def buildLaggedFeatures(s, lag=1, dropna=True):
    """Append lagged copies of every column to a DataFrame or Series.

    Parameters
    ----------
    s : pandas.DataFrame or pandas.Series
        Input data ordered in time. Subclasses are accepted.
    lag : int, default 1
        Number of lags to create (1..lag).
    dropna : bool, default True
        If True, drop every row containing a missing value, which includes
        the leading rows left incomplete by shifting.

    Returns
    -------
    pandas.DataFrame or None
        For a DataFrame: each original column followed by its lags, named
        ``<col>_lag<k>``. For a Series: columns ``lag_0`` .. ``lag_<lag>``.
        Returns None (after printing a message) for any other input type.
    """
    # isinstance (rather than an exact type check) also accepts subclasses.
    if isinstance(s, pd.DataFrame):
        lagged = {}
        for col_name in s:
            column = s[col_name]
            lagged[col_name] = column
            # create lagged Series right after the column they derive from
            for step in range(1, lag + 1):
                lagged['%s_lag%d' % (col_name, step)] = column.shift(step)
        res = pd.DataFrame(lagged, index=s.index)
    elif isinstance(s, pd.Series):
        shifts = range(lag + 1)
        res = pd.concat([s.shift(i) for i in shifts], axis=1)
        res.columns = ['lag_%d' % i for i in shifts]
    else:
        print('Only works for DataFrame or Series')
        return None
    return res.dropna() if dropna else res
news_df = buildLaggedFeatures(news_tf_idf_sw_fd, lag=1, dropna=True)
#The feature matrix now starts at week 4: the newsfeed begins at week 2, first differencing dropped week 2, and the one-week lag dropped week 3.
news_df.shape
(50, 73658)
# Chronological 80/20 split (no shuffling, since rows are consecutive weeks).
# Positional slicing is used instead of np.split, which goes through
# DataFrame.swapaxes (deprecated in recent pandas releases).
split_at = int(.80 * len(news_df))
X_train, X_test = news_df.iloc[:split_at], news_df.iloc[split_at:]
X_train.shape
(40, 73658)
X_test.shape
(10, 73658)
def _split_80_20(frame):
    """Return the first 80% of rows and the remaining rows, in order."""
    # Positional slicing is used instead of np.split, which goes through
    # DataFrame.swapaxes (deprecated in recent pandas releases).
    cut = int(.80 * len(frame))
    return frame.iloc[:cut], frame.iloc[cut:]


# Chronological 80/20 split of every target, using each target's own length.
y_battles_train, y_battles_test = _split_80_20(y_battles)
y_explosions_train, y_explosions_test = _split_80_20(y_explosions)
y_protests_train, y_protests_test = _split_80_20(y_protests)
y_riots_train, y_riots_test = _split_80_20(y_riots)
y_vac_train, y_vac_test = _split_80_20(y_vac)
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_battles_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 2.5826187606826747} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -4.8603
print(grid.score(X_test, y_battles_test))
-5.587540975032525
ridge_best = Ridge(alpha=2.5826187606826747, random_state=42, max_iter=100000).fit(X_train, y_battles_train)
y_battles_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_battles_test, y_battles_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 5.5875
'''
pickle.dump(ridge_best, open(export_path + "ridge_best_y_battles.pkl", "wb"))
'''
ridge_best = pickle.load(open(export_path + "ridge_best_y_battles.pkl", "rb"))
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_battles_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 0.3696912707195028} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -4.8892
lasso_best = Lasso(alpha=0.3696912707195028, random_state=42, max_iter=100000).fit(X_train, y_battles_train)
y_battles_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_battles_test, y_battles_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 5.6268
'''
pickle.dump(lasso_best, open(export_path + "lasso_best_y_battles.pkl", "wb"))
'''
lasso_best = pickle.load(open(export_path + "lasso_best_y_battles.pkl", "rb"))
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=1, max_features="sqrt"))
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
'randomforestregressor__max_depth': [1, 10, 20, 30],
'randomforestregressor__min_samples_leaf': [1, 5, 10],
'randomforestregressor__max_features': [170, 220, 270, 320, 370]}
grid = GridSearchCV(rf_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_battles_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 220, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__n_estimators': 3000} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -4.6324
rf_best = RandomForestRegressor(random_state=42, n_estimators=3000, max_depth=10, min_samples_leaf=1, max_features=220, n_jobs=-1).fit(X_train, y_battles_train.values.ravel())
y_battles_preds_rf = rf_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_battles_test, y_battles_preds_rf))))
RMSE when Optimal Model is Fit to the Test Set: 5.5692
'''
pickle.dump(rf_best, open(export_path + "rf_best_y_battles.pkl", "wb"))
'''
rf_best = pickle.load(open(export_path + "rf_best_y_battles.pkl", "rb"))
xgb_pipe = make_pipeline(XGBRegressor(random_state=42, objective ='reg:squarederror', n_jobs=-1))
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, gamma=None, gpu_id=None, importance_type='gain', interaction_constraints=None, learning_rate=None, max_delta_step=None, max_depth=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=-1, num_parallel_tree=None, objective='reg:linear', random_state=42, reg_alpha=None, reg_lambda=None, scale_pos_weight=None, subsample=None, tree_method=None, validate_parameters=None, verbosity=None))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'xgbregressor__n_estimators': [50, 100, 150],
'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
'xgbregressor__max_depth': [4, 6, 8],
'xgbregressor__min_child_weight': [1, 3, 5, 7]
}
grid = GridSearchCV(xgb_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_battles_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
[22:24:20] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror. {'xgbregressor__learning_rate': 0.05, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 50} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -4.1704
# Refit XGBoost with the grid-search winners, using the same objective as the
# searched pipeline, and evaluate it on the held-out weeks.
xgb_best = XGBRegressor(random_state=42, objective='reg:squarederror', n_estimators=50, learning_rate=0.05, max_depth=4, min_child_weight=7, n_jobs=-1).fit(X_train, y_battles_train.values.ravel())
# Stored under an XGBoost-specific name so the random-forest predictions
# (y_battles_preds_rf) are not overwritten.
y_battles_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_battles_test, y_battles_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 5.4413
'''
pickle.dump(xgb_best, open(export_path + "xgb_best_y_battles.pkl", "wb"))
'''
pickle.dump(xgb_best, open(export_path + "xgb_best_y_battles.pkl", "wb"))
Comparison of Performance for Models on Cross Validation Data Using y_battles_train and X_train:
RMSE for:
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_explosions_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 3.1622776601683795} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -1.5569
ridge_best = Ridge(alpha= 3.1622776601683795, random_state=42, max_iter=100000).fit(X_train, y_explosions_train)
y_explosions_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_explosions_test, y_explosions_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 1.7543
'''
pickle.dump(ridge_best, open(export_path + "ridge_best_y_explosions.pkl", "wb"))
'''
ridge_best = pickle.load(open(export_path + "ridge_best_y_explosions.pkl", "rb"))
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_explosions_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 0.07316807143427195} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -1.5465
lasso_best = Lasso(alpha= 0.07316807143427195, random_state=42, max_iter=100000).fit(X_train, y_explosions_train)
y_explosions_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_explosions_test, y_explosions_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 1.7357
'''
pickle.dump(lasso_best, open(export_path + "lasso_best_y_explosions.pkl", "wb"))
'''
pickle.dump(lasso_best, open(export_path + "lasso_best_y_explosions.pkl", "wb"))
from sklearn.ensemble import RandomForestRegressor
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=1, max_features="sqrt"))
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
'randomforestregressor__max_depth': [1, 10, 20, 30],
'randomforestregressor__min_samples_leaf': [1, 5, 10],
'randomforestregressor__max_features': [170, 220, 270, 320, 370]}
grid = GridSearchCV(rf_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_explosions_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 320, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__n_estimators': 3000} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -1.5342
rf_best = RandomForestRegressor(random_state=42, n_estimators=3000, max_depth=10, min_samples_leaf=1, max_features=320, n_jobs=-1).fit(X_train, y_explosions_train.values.ravel())
y_explosions_preds_rf = rf_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_explosions_test, y_explosions_preds_rf))))
RMSE when Optimal Model is Fit to the Test Set: 1.7481
'''
pickle.dump(rf_best, open(export_path + "rf_best_y_explosions.pkl", "wb"))
'''
# Reload the fitted random forest for the explosions target. The file name is
# now a properly quoted string, and the handle is closed after reading.
with open(export_path + "rf_best_y_explosions.pkl", "rb") as f:
    rf_best = pickle.load(f)
xgb_pipe = make_pipeline(XGBRegressor(random_state=42, objective ='reg:squarederror', n_jobs=-1))
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, gamma=None, gpu_id=None, importance_type='gain', interaction_constraints=None, learning_rate=None, max_delta_step=None, max_depth=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=-1, num_parallel_tree=None, random_state=42, reg_alpha=None, reg_lambda=None, scale_pos_weight=None, subsample=None, tree_method=None, validate_parameters=None, verbosity=None))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'xgbregressor__n_estimators': [50, 100, 150],
'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
'xgbregressor__max_depth': [4, 6, 8],
'xgbregressor__min_child_weight': [1, 3, 5, 7]
}
grid = GridSearchCV(xgb_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_explosions_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'xgbregressor__learning_rate': 0.2, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 100} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -1.4564
# Refit XGBoost with the grid-search winners, using the same objective as the
# searched pipeline, and evaluate it on the held-out weeks.
xgb_best = XGBRegressor(random_state=42, objective='reg:squarederror', n_estimators=100, learning_rate=0.2, max_depth=4, min_child_weight=7, n_jobs=-1).fit(X_train, y_explosions_train.values.ravel())
# Stored under an XGBoost-specific name so the random-forest predictions
# (y_explosions_preds_rf) are not overwritten.
y_explosions_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_explosions_test, y_explosions_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 1.7689
'''
pickle.dump(xgb_best, open(export_path + "xgb_best_y_explosions.pkl", "wb"))
'''
pickle.dump(xgb_best, open(export_path + "xgb_best_y_explosions.pkl", "wb"))
Comparison of Performance for Models on Cross Validation Data Using y_explosions_train and X_train:
RMSE for:
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_protests_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 3.1622776601683795} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -108.2137
ridge_best = Ridge(alpha= 3.1622776601683795, random_state=42, max_iter=100000).fit(X_train, y_protests_train)
y_protests_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_protests_test, y_protests_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 116.6480
'''
pickle.dump(ridge_best, open(export_path + "ridge_best_y_protests.pkl", "wb"))
'''
ridge_best = pickle.load(open(export_path + "ridge_best_y_protests.pkl", "rb"))
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_protests_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 1.7225859653987856} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -107.6887
lasso_best = Lasso(alpha=1.7225859653987856, random_state=42, max_iter=100000).fit(X_train, y_protests_train)
y_protests_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_protests_test, y_protests_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 115.6709
'''
pickle.dump(lasso_best, open(export_path + "lasso_best_y_protests.pkl", "wb"))
'''
lasso_best = pickle.load(open(export_path + "lasso_best_y_protests.pkl", "rb"))
from sklearn.ensemble import RandomForestRegressor
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=1, max_features="sqrt"))
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
'randomforestregressor__max_depth': [1, 10, 20, 30],
'randomforestregressor__min_samples_leaf': [1, 5, 10],
'randomforestregressor__max_features': [170, 220, 270, 320, 370]}
grid = GridSearchCV(rf_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_protests_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'randomforestregressor__max_depth': 1, 'randomforestregressor__max_features': 320, 'randomforestregressor__min_samples_leaf': 10, 'randomforestregressor__n_estimators': 2000} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -107.4333
rf_best = RandomForestRegressor(random_state=42, n_estimators=2000, max_depth=1, min_samples_leaf=10, max_features=320, n_jobs=-1).fit(X_train, y_protests_train.values.ravel())
y_protests_preds_rf = rf_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_protests_test, y_protests_preds_rf))))
RMSE when Optimal Model is Fit to the Test Set: 117.9724
'''
pickle.dump(rf_best, open(export_path + "rf_best_y_protests.pkl", "wb"))
'''
rf_best = pickle.load(open(export_path + "rf_best_y_protests.pkl", "rb"))
xgb_pipe = make_pipeline(XGBRegressor(random_state=42, objective ='reg:squarederror', n_jobs=-1))
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, gamma=None, gpu_id=None, importance_type='gain', interaction_constraints=None, learning_rate=None, max_delta_step=None, max_depth=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=-1, num_parallel_tree=None, random_state=42, reg_alpha=None, reg_lambda=None, scale_pos_weight=None, subsample=None, tree_method=None, validate_parameters=None, verbosity=None))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'xgbregressor__n_estimators': [50, 100, 150],
'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
'xgbregressor__max_depth': [4, 6, 8],
'xgbregressor__min_child_weight': [3, 5, 7]
}
grid = GridSearchCV(xgb_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_protests_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'xgbregressor__learning_rate': 0.05, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 50} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -116.9903
# Refit XGBoost with the grid-search winners, using the same objective as the
# searched pipeline, and evaluate it on the held-out weeks.
xgb_best = XGBRegressor(random_state=42, objective='reg:squarederror', n_estimators= 50, learning_rate=0.05, max_depth=4, min_child_weight=7, n_jobs=-1).fit(X_train, y_protests_train.values.ravel())
# Stored under an XGBoost-specific name so the random-forest predictions
# (y_protests_preds_rf) are not overwritten.
y_protests_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_protests_test, y_protests_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 124.2078
'''
pickle.dump(xgb_best, open(export_path + "xgb_best_y_protests.pkl", "wb"))
'''
pickle.dump(xgb_best, open(export_path + "xgb_best_y_protests.pkl", "wb"))
Comparison of Performance for Models on Cross Validation Data Using y_protests_train and X_train:
RMSE for:
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_riots_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 3.1622776601683795} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -2.5179
ridge_best = Ridge(alpha= 3.1622776601683795, random_state=42, max_iter=100000).fit(X_train, y_riots_train)
y_riots_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_riots_test, y_riots_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 2.2682
'''
pickle.dump(ridge_best, open(export_path + "ridge_best_y_riots.pkl", "wb"))
'''
ridge_best = pickle.load(open(export_path + "ridge_best_y_riots.pkl", "rb"))
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_riots_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 0.07934096665797492} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -2.4961
lasso_best = Lasso(alpha=0.07934096665797492, random_state=42, max_iter=100000).fit(X_train, y_riots_train)
y_riots_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_riots_test, y_riots_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 2.2411
# Export snippet kept for reference (disabled):
# with open(export_path + "lasso_best_y_riots.pkl", "wb") as model_file:
#     pickle.dump(lasso_best, model_file)

# Reload the saved riots Lasso model; this replaces the estimator fitted above.
# The context manager guarantees the file handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files this project wrote.
with open(export_path + "lasso_best_y_riots.pkl", "rb") as model_file:
    lasso_best = pickle.load(model_file)

from sklearn.ensemble import RandomForestRegressor

# Single-step pipeline around a random forest. The n_estimators, max_depth,
# min_samples_leaf and max_features set here are starting values that the
# grid search below overrides.
rf_pipe = make_pipeline(
    RandomForestRegressor(
        random_state=42,
        n_estimators=100,
        max_depth=10,
        min_samples_leaf=1,
        max_features="sqrt",
    )
)
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
# Tune the random forest for the riots target with 5 time-ordered CV splits.
# The grid has 4 * 4 * 3 * 5 = 240 combinations, i.e. 1200 fits plus the
# final refit.
# NOTE(review): the recorded optimum uses max_depth=1 (smallest in the grid)
# and max_features=370 (largest in the grid); the grid edges may be limiting.
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
    'randomforestregressor__max_depth': [1, 10, 20, 30],
    'randomforestregressor__min_samples_leaf': [1, 5, 10],
    # Integer values are absolute feature counts.
    # Assumes X_train has at least 370 columns -- TODO confirm.
    'randomforestregressor__max_features': [170, 220, 270, 320, 370],
}
grid = GridSearchCV(
    rf_pipe,
    param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
# Flatten the target to 1-D before handing it to the forest.
grid.fit(X_train, y_riots_train.values.ravel())
print(grid.best_params_)
# The scorer returns the *negated* RMSE (higher is better), so flip the sign
# to report an actual, non-negative RMSE.
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(-grid.best_score_))
{'randomforestregressor__max_depth': 1, 'randomforestregressor__max_features': 370, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__n_estimators': 2000} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -2.3903
# Refit the random forest on the full training set with the hyperparameters
# reported by the grid search, then measure its RMSE on the test set.
rf_best = RandomForestRegressor(
    n_estimators=2000,
    max_depth=1,
    min_samples_leaf=1,
    max_features=370,
    random_state=42,
    n_jobs=-1,
)
rf_best.fit(X_train, y_riots_train.values.ravel())
y_riots_preds_rf = rf_best.predict(X_test)
rf_riots_test_mse = mean_squared_error(y_riots_test, y_riots_preds_rf)
print(f'RMSE when Optimal Model is Fit to the Test Set: {sqrt(rf_riots_test_mse):.4f}')
RMSE when Optimal Model is Fit to the Test Set: 2.1061
# Export snippet kept for reference (disabled):
# with open(export_path + "rf_best_y_riots.pkl", "wb") as model_file:
#     pickle.dump(rf_best, model_file)

# Reload the saved riots random forest; this replaces the estimator fitted
# above. The context manager guarantees the file handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files this project wrote.
with open(export_path + "rf_best_y_riots.pkl", "rb") as model_file:
    rf_best = pickle.load(model_file)

# Single-step pipeline around XGBoost so the grid can address
# 'xgbregressor__*' parameters.
xgb_pipe = make_pipeline(
    XGBRegressor(random_state=42, objective='reg:squarederror', n_jobs=-1)
)
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, gamma=None, gpu_id=None, importance_type='gain', interaction_constraints=None, learning_rate=None, max_delta_step=None, max_depth=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=-1, num_parallel_tree=None, random_state=42, reg_alpha=None, reg_lambda=None, scale_pos_weight=None, subsample=None, tree_method=None, validate_parameters=None, verbosity=None))]
# Tune XGBoost for the riots target with 5 time-ordered CV splits.
# The grid has 3 * 4 * 3 * 3 = 108 combinations, i.e. 540 fits plus the
# final refit.
# NOTE(review): the recorded optimum takes the smallest n_estimators,
# learning_rate and max_depth in the grid; lower values may do better.
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'xgbregressor__n_estimators': [50, 100, 150],
    'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
    'xgbregressor__max_depth': [4, 6, 8],
    'xgbregressor__min_child_weight': [3, 5, 7],
}
grid = GridSearchCV(
    xgb_pipe,
    param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
# Flatten the target to 1-D before fitting.
grid.fit(X_train, y_riots_train.values.ravel())
print(grid.best_params_)
# The scorer returns the *negated* RMSE (higher is better), so flip the sign
# to report an actual, non-negative RMSE.
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(-grid.best_score_))
{'xgbregressor__learning_rate': 0.05, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 5, 'xgbregressor__n_estimators': 50} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -2.1890
# Refit XGBoost on the full training set with the hyperparameters reported by
# the grid search, then measure its RMSE on the held-out test set.
# The objective is spelled out so the refit matches the tuned pipeline.
xgb_best = XGBRegressor(
    random_state=42,
    objective='reg:squarederror',
    n_estimators=50,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=5,
    n_jobs=-1,
).fit(X_train, y_riots_train.values.ravel())
# Store the predictions under their own name; writing them to
# y_riots_preds_rf would overwrite the random-forest predictions.
y_riots_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_riots_test, y_riots_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 1.9828
# Persist the fitted riots XGBoost model. Unlike the other riots models, which
# are reloaded from disk, this one is written on every run.
# The context manager flushes and closes the file handle.
with open(export_path + "xgb_best_y_riots.pkl", "wb") as model_file:
    pickle.dump(xgb_best, model_file)
Comparison of Performance for Models on Cross Validation Data Using y_riots_train and X_train:
RMSE for:
# Single-step pipeline around Ridge so the grid can address 'ridge__alpha'.
ridge_pipe = make_pipeline(
    Ridge(max_iter=100000, random_state=42)
)
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
# Tune the Ridge penalty for the y_vac target with 5 time-ordered CV splits
# over 200 log-spaced alphas from 10**-3 to 10**0.5.
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(
    ridge_pipe,
    param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
grid.fit(X_train, y_vac_train)
print(grid.best_params_)
# The scorer returns the *negated* RMSE (higher is better), so flip the sign
# to report an actual, non-negative RMSE.
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(-grid.best_score_))
{'ridge__alpha': 0.02171117945694504} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -119.6204
# Refit Ridge on the full training set with the alpha reported by the grid
# search, then measure its RMSE on the held-out test set.
ridge_best = Ridge(alpha=0.02171117945694504, max_iter=100000, random_state=42)
ridge_best.fit(X_train, y_vac_train)
y_vac_preds_ridge = ridge_best.predict(X_test)
ridge_vac_test_mse = mean_squared_error(y_vac_test, y_vac_preds_ridge)
print(f'RMSE when Optimal Model is Fit to the Test Set: {sqrt(ridge_vac_test_mse):.4f}')
RMSE when Optimal Model is Fit to the Test Set: 157.2134
# Export snippet kept for reference (disabled):
# with open(export_path + "ridge_best_y_vac.pkl", "wb") as model_file:
#     pickle.dump(ridge_best, model_file)

# Reload the saved y_vac Ridge model; this replaces the estimator fitted above.
# The context manager guarantees the file handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files this project wrote.
with open(export_path + "ridge_best_y_vac.pkl", "rb") as model_file:
    ridge_best = pickle.load(model_file)

# Single-step pipeline around Lasso so the grid can address 'lasso__alpha'.
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
# Tune the Lasso penalty for the y_vac target with 5 time-ordered CV splits
# over 200 log-spaced alphas from 10**-3 to 10**0.5.
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(
    lasso_pipe,
    param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
grid.fit(X_train, y_vac_train)
print(grid.best_params_)
# The scorer returns the *negated* RMSE (higher is better), so flip the sign
# to report an actual, non-negative RMSE.
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(-grid.best_score_))
{'lasso__alpha': 0.16446761779946645} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -118.2845
# Refit Lasso on the full training set with the alpha reported by the grid
# search, then measure its RMSE on the held-out test set.
lasso_best = Lasso(alpha=0.16446761779946645, max_iter=100000, random_state=42)
lasso_best.fit(X_train, y_vac_train)
y_vac_preds_lasso = lasso_best.predict(X_test)
lasso_vac_test_mse = mean_squared_error(y_vac_test, y_vac_preds_lasso)
print(f'RMSE when Optimal Model is Fit to the Test Set: {sqrt(lasso_vac_test_mse):.4f}')
RMSE when Optimal Model is Fit to the Test Set: 159.3633
# Export snippet kept for reference (disabled):
# with open(export_path + "lasso_best_y_vac.pkl", "wb") as model_file:
#     pickle.dump(lasso_best, model_file)

# Reload the saved y_vac Lasso model; this replaces the estimator fitted above.
# The context manager guarantees the file handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files this project wrote.
with open(export_path + "lasso_best_y_vac.pkl", "rb") as model_file:
    lasso_best = pickle.load(model_file)

from sklearn.ensemble import RandomForestRegressor

# Single-step pipeline around a random forest. The n_estimators, max_depth,
# min_samples_leaf and max_features set here are starting values that the
# grid search below overrides.
rf_pipe = make_pipeline(
    RandomForestRegressor(
        random_state=42,
        n_estimators=100,
        max_depth=10,
        min_samples_leaf=1,
        max_features="sqrt",
    )
)
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
# Tune the random forest for the y_vac target with 5 time-ordered CV splits.
# The grid has 4 * 4 * 3 * 5 = 240 combinations, i.e. 1200 fits plus the
# final refit.
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
    'randomforestregressor__max_depth': [1, 10, 20, 30],
    'randomforestregressor__min_samples_leaf': [1, 5, 10],
    # Integer values are absolute feature counts.
    # Assumes X_train has at least 370 columns -- TODO confirm.
    'randomforestregressor__max_features': [170, 220, 270, 320, 370],
}
grid = GridSearchCV(
    rf_pipe,
    param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
# Flatten the target to 1-D before handing it to the forest.
grid.fit(X_train, y_vac_train.values.ravel())
print(grid.best_params_)
# The scorer returns the *negated* RMSE (higher is better), so flip the sign
# to report an actual, non-negative RMSE.
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(-grid.best_score_))
{'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 270, 'randomforestregressor__min_samples_leaf': 5, 'randomforestregressor__n_estimators': 2000} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -127.2781
# Refit the random forest on the full training set with the hyperparameters
# reported by the grid search, then measure its RMSE on the test set.
rf_best = RandomForestRegressor(
    n_estimators=2000,
    max_depth=10,
    min_samples_leaf=5,
    max_features=270,
    random_state=42,
    n_jobs=-1,
)
rf_best.fit(X_train, y_vac_train.values.ravel())
y_vac_preds_rf = rf_best.predict(X_test)
rf_vac_test_mse = mean_squared_error(y_vac_test, y_vac_preds_rf)
print(f'RMSE when Optimal Model is Fit to the Test Set: {sqrt(rf_vac_test_mse):.4f}')
RMSE when Optimal Model is Fit to the Test Set: 121.5632
# Export snippet kept for reference (disabled):
# with open(export_path + "rf_best_y_vac.pkl", "wb") as model_file:
#     pickle.dump(rf_best, model_file)

# Reload the saved y_vac random forest; this replaces the estimator fitted
# above. The context manager guarantees the file handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files this project wrote.
with open(export_path + "rf_best_y_vac.pkl", "rb") as model_file:
    rf_best = pickle.load(model_file)

# Single-step pipeline around XGBoost so the grid can address
# 'xgbregressor__*' parameters.
xgb_pipe = make_pipeline(
    XGBRegressor(random_state=42, objective='reg:squarederror', n_jobs=-1)
)
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, gamma=None, gpu_id=None, importance_type='gain', interaction_constraints=None, learning_rate=None, max_delta_step=None, max_depth=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=-1, num_parallel_tree=None, random_state=42, reg_alpha=None, reg_lambda=None, scale_pos_weight=None, subsample=None, tree_method=None, validate_parameters=None, verbosity=None))]
# Tune XGBoost for the y_vac target with 5 time-ordered CV splits.
# The grid has 3 * 4 * 3 * 3 = 108 combinations, i.e. 540 fits plus the
# final refit.
# NOTE(review): the recorded optimum takes the largest n_estimators and
# min_child_weight and the smallest max_depth in the grid; the grid edges may
# be limiting the search.
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {
    'xgbregressor__n_estimators': [50, 100, 150],
    'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
    'xgbregressor__max_depth': [4, 6, 8],
    'xgbregressor__min_child_weight': [3, 5, 7],
}
grid = GridSearchCV(
    xgb_pipe,
    param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
# Flatten the target to 1-D before fitting.
grid.fit(X_train, y_vac_train.values.ravel())
print(grid.best_params_)
# The scorer returns the *negated* RMSE (higher is better), so flip the sign
# to report an actual, non-negative RMSE.
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(-grid.best_score_))
{'xgbregressor__learning_rate': 0.15, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 150} Mean RMSE when Optimal Model is Fit to Cross Validation Data: -123.4189
# Refit XGBoost on the full training set with the hyperparameters reported by
# the grid search, then measure its RMSE on the held-out test set.
xgb_best = XGBRegressor(
    n_estimators=150,
    learning_rate=0.15,
    max_depth=4,
    min_child_weight=7,
    random_state=42,
    n_jobs=-1,
)
xgb_best.fit(X_train, y_vac_train.values.ravel())
y_vac_preds_xgb = xgb_best.predict(X_test)
xgb_vac_test_mse = mean_squared_error(y_vac_test, y_vac_preds_xgb)
print(f'RMSE when Optimal Model is Fit to the Test Set: {sqrt(xgb_vac_test_mse):.4f}')
RMSE when Optimal Model is Fit to the Test Set: 158.5494
# Persist the fitted y_vac XGBoost model. Unlike the other y_vac models, which
# are reloaded from disk, this one is written on every run.
# The context manager flushes and closes the file handle.
with open(export_path + "xgb_best_y_vac.pkl", "wb") as model_file:
    pickle.dump(xgb_best, model_file)
Comparison of Performance for Models on Cross Validation Data Using y_vac_train and X_train:
RMSE for: