Portfolio Project: Using News Articles on Events in Nigeria in 2019 to Predict Political Violence¶



In this portfolio project, I explore whether news trends from a variety of news articles and news snippets on events in Nigeria in 2019 can predict several categories of violent political events tracked by the ACLED political violence dataset in 2019, including 1) battles, 2) explosions and remote violence, 3) protests in which authorities responded using excessive force, 4) riots, and 5) violence against civilians. I do so using Ridge, Lasso, Random Forest, and XGBoost regression models, comparing their performance in terms of root mean squared error (RMSE).¶

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
import pickle
In [2]:
the_path = "C:/Users/rsb84/Desktop/RB/portfolio_data/nigeria_news/"
export_path = "C:/Users/rsb84/Desktop/RB/portfolio_data/nigeria_news/export/"
In [3]:
df = pd.read_csv('Nigeria2019_ACLED_Extract.csv')
In [4]:
event_types = df[['week', 'event_type', 'sub_event_type']].copy()
In [5]:
event_types.head()
Out[5]:
week event_type sub_event_type
0 1 Battles Armed clash
1 1 Battles Armed clash
2 1 Explosions/Remote violence Air/drone strike
3 1 Violence against civilians Attack
4 1 Battles Armed clash

Note: I used Excel's =WEEKNUM(date) function to create an extra column in which each date was converted into its week number of the year. I did this so that I could later group weekly observations of the news text-based features with the corresponding weekly occurrences of event_types in the ACLED dataset.¶
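The same week column can also be derived in pandas rather than Excel. A minimal sketch, assuming the raw ACLED extract stores its dates in an event_date column (the column name here is an assumption); note that pandas' ISO week numbering can differ by one from Excel's default WEEKNUM scheme, which starts week 1 on January 1:¶

import pandas as pd

df = pd.read_csv('Nigeria2019_ACLED_Extract.csv')
# 'event_date' is an assumed column name for the raw ACLED date field.
df['week'] = pd.to_datetime(df['event_date']).dt.isocalendar().week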

The ACLED dataset contains 6 different event types, each comprising various sub-event types. The "strategic developments" event type is the only completely non-violent event type. The event type of "protests" contains the sub-event types of 1) "peaceful protest", 2) "protest with intervention", and 3) "excessive force against protesters". "Protest with intervention" is defined as "a peaceful protest during which there is an attempt to disperse or suppress the protest without serious/lethal injuries being reported or the targeting of protesters with lethal weapons." (ACLED Codebook, pg. 15). Since I am only interested in predicting violent event types, I will first delete observations in which the event type is "strategic developments" or the sub-event types are "peaceful protest" or "protest with intervention". Please see the ACLED Codebook for information about the dataset: https://acleddata.com/acleddatanew/wp-content/uploads/2021/11/ACLED_Codebook_v1_January-2021.pdf¶

In [6]:
event_types = event_types.loc[~(event_types['event_type'] == 'Strategic developments'),:]
event_types = event_types.loc[~(event_types['sub_event_type'] == 'Peaceful protest'),:]
event_types = event_types.loc[~(event_types['sub_event_type'] == 'Protest with intervention'),:]

Here, I one-hot encode the event_type categorical variable (using pandas get_dummies) so that my machine learning models can process it:¶

In [7]:
event_types_dummies = pd.get_dummies(event_types[['event_type']])
In [8]:
event_types_dummies.head()
Out[8]:
event_type_Battles event_type_Explosions/Remote violence event_type_Protests event_type_Riots event_type_Violence against civilians
0 1 0 0 0 0
1 1 0 0 0 0
2 0 1 0 0 0
3 0 0 0 0 1
4 1 0 0 0 0
In [9]:
event_types_dummies.shape
Out[9]:
(1492, 5)
In [10]:
event_types.shape
Out[10]:
(1492, 3)

Here, I combine the week column of my above event_types dataframe with all the one-hot-encoded event_type variables in the event_types_dummies dataframe. If pd.get_dummies were applied to event_types as a whole, without separating out the week column, it would one-hot encode the weeks as well, which is not what I want:¶
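As an aside, the same result can be obtained in one step by passing the columns parameter to get_dummies, which encodes only the listed columns and passes the rest through untouched. A minimal equivalent sketch:¶

# week survives unencoded; only event_type is expanded into dummy columns.
event_types_combined_alt = pd.get_dummies(event_types[['week', 'event_type']],
                                          columns=['event_type'])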

In [11]:
event_types_combined = pd.concat([event_types[['week']], event_types_dummies], axis=1) 
In [12]:
event_types_combined.shape
Out[12]:
(1492, 6)
In [13]:
event_types_combined.head()
Out[13]:
week event_type_Battles event_type_Explosions/Remote violence event_type_Protests event_type_Riots event_type_Violence against civilians
0 1 1 0 0 0 0
1 1 1 0 0 0 0
2 1 0 1 0 0 0
3 1 0 0 0 0 1
4 1 1 0 0 0 0

Here, I group all event type variable counts, summed by the corresponding week the events occurred in:¶

In [14]:
event_types_by_week = event_types_combined.groupby(['week']).sum()
In [15]:
event_types_by_week.shape
Out[15]:
(53, 5)
In [16]:
event_types_by_week.head()
Out[16]:
event_type_Battles event_type_Explosions/Remote violence event_type_Protests event_type_Riots event_type_Violence against civilians
week
1 18 4 0 1 5
2 10 0 0 1 6
3 11 0 0 7 11
4 17 1 1 6 6
5 10 3 0 5 13
In [17]:
event_types_by_week = event_types_by_week.reset_index()
In [18]:
event_types_by_week = pd.DataFrame(event_types_by_week)

Now, I import the news articles and news snippets on events in Nigeria in 2019, and begin to prepare each corpus to be grouped by week of the year:¶

In [23]:
newsfeed = pd.read_csv("newsfeed.csv")
In [24]:
newsfeed.columns
Out[24]:
Index(['week', 'text'], dtype='object')
In [25]:
newsfeed = newsfeed.sort_values(by="week")
In [26]:
newsfeed.shape
Out[26]:
(24355, 2)

There are 24,355 news articles and news snippets; the week of the year in which each article or snippet was written is recorded in the column preceding the text. The lambda function below concatenates all news articles/snippets written in the same week of the year into a single row per week.¶

In [27]:
newsfeed_by_week = newsfeed.groupby('week')['text'].apply(lambda x:x.str.cat(sep=" "))
In [28]:
newsfeed_by_week = pd.DataFrame(newsfeed_by_week)
In [29]:
newsfeed_by_week = newsfeed_by_week.reset_index()
In [30]:
newsfeed_by_week['week'].values
Out[30]:
array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53], dtype=int64)


We see from the code above that the dataset contains no news articles or snippets written during week 1; all other weeks of the year have newsfeeds. However, since week 1 is missing from the newsfeed_by_week dataframe, we will also need to eliminate the week 1 row from the ACLED event_types_by_week dataframe (below) to make sure the features and the targets have the same number of rows.¶

We will also need to first difference the newsfeed data because non-stationarity is likely present. First differencing will leave the first remaining row - i.e., week 2 - with missing values, so that row must be deleted as well. Further, after first differencing removes week 2, we will take lags of each column of the newsfeed dataset and add the lags to the dataset. Since taking a lag leaves a missing value in the first row of each lagged column, the new first row (week 3) must also be deleted.¶
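To make the row accounting concrete, here is a toy illustration (with made-up numbers) of how .diff() and .shift() each introduce one leading missing value that forces a row to be dropped:¶

import pandas as pd

toy = pd.Series([5.0, 7.0, 6.0, 9.0], index=[2, 3, 4, 5], name='x').to_frame()  # index = week
toy_fd = toy.diff(periods=1).dropna()        # differencing drops the first row (week 2)
toy_fd['x_lag1'] = toy_fd['x'].shift(1)      # lagging leaves NaN in the new first row
toy_fd = toy_fd.dropna()                     # dropping it removes week 3 as well
print(toy_fd.index.tolist())                 # [4, 5] - two rows lost in total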

Thus, to make sure our target variables have the same number of rows as our feature variables, I will delete weeks 1, 2, and 3 from the event type data below. This will ultimately leave us with 50 remaining rows (weeks) of data for both the features and the targets:¶



In [47]:
event_types_by_week['week'].values
Out[47]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53], dtype=int64)
In [48]:
event_types_by_week_50_wks = event_types_by_week.iloc[3: , :] #This removes the first 3 rows (weeks 1 - 3) from the dataframe
In [49]:
event_types_by_week_50_wks.head()
Out[49]:
week event_type_Battles event_type_Explosions/Remote violence event_type_Protests event_type_Riots event_type_Violence against civilians
3 4 17 1 1 6 6
4 5 10 3 0 5 13
5 6 15 1 0 10 24
6 7 13 2 0 4 49
7 8 21 2 0 4 27
In [50]:
event_types_by_week_50_wks.shape
Out[50]:
(50, 6)
In [51]:
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rsb84\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[51]:
True

Here, I create functions in Python that lowercase all text in the dataframe and remove stopwords:¶

In [52]:
def rem_sw(var):
    from nltk.corpus import stopwords
    sw = stopwords.words('english')
    sw = set(sw)
    new_sw = {"said", "says", "saying", "say", "us", "since", "like", "likes", "people"}
    sw = sw.union(new_sw)
    tmp = var.split() #tokenizing words here
    fin_var = [word for word in tmp if word not in sw] #removing stopwords
    fin_var = ' '.join(fin_var) #rejoining tokenized words (without stopwords)
    return fin_var
In [53]:
newsfeed_by_week['text_lower'] = newsfeed_by_week.text.apply(lambda x: x.lower())
newsfeed_by_week['text_sw'] = newsfeed_by_week.text_lower.apply(rem_sw)
In [54]:
the_column1 = 'text_sw'

A brief note on Term Frequency-Inverse Document Frequency (TF-IDF) Matrices:¶

TF-IDF is a statistical measure of how relevant a particular word is to a given document relative to all documents in a dataset (here, each week's concatenated newsfeed serves as one document). Its value for a given word in a document increases the more frequently the word occurs in that document, but is penalized by the number of documents overall which contain the word. The underpinning concept is that words which occur in many documents are likely to be less important/relevant than words that occur frequently in fewer documents.¶

Specifically, the term frequency (the number of times the word under consideration occurs in a given document) is multiplied by the inverse document frequency - a measure of the rarity of the word across the entire set of documents. Scikit-learn's smoothed formulation is idf(t) = log((1 + total number of documents) / (1 + number of documents containing the term)) + 1.¶

Because scikit-learn L2-normalizes each document's TF-IDF vector by default, each score falls between 0 and 1, with higher scores implying greater word relevance.¶
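As a quick illustration of these mechanics on a three-document toy corpus (using the same TfidfVectorizer applied below):¶

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

docs = ["troops clashed near the border",
        "troops deployed to the city",
        "markets reopened in the city"]
vec = TfidfVectorizer()
scores = pd.DataFrame(vec.fit_transform(docs).toarray(),
                      columns=vec.get_feature_names_out())
# "troops" appears in two of the three documents, so it scores lower than
# words unique to a single document, such as "border" or "markets".
print(scores.round(2))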



Below, I create a function that transforms the text using TF-IDF into numerical features that can be fit to machine learning models:¶

In [55]:
def my_tf_idf_sw(var, path_in):
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd
    import pickle
    my_tf_idf = TfidfVectorizer()
    my_tf_idf_vec = pd.DataFrame(my_tf_idf.fit_transform(var).toarray())
    my_tf_idf_vec.columns = my_tf_idf.get_feature_names_out()
    pickle.dump(my_tf_idf, open(path_in + "tf_idf_sw.pkl", "wb"))
    pickle.dump(my_tf_idf_vec, open(path_in + "tf_idf_df_sw.pkl", "wb" ) )
    return my_tf_idf_vec
In [56]:
news_tf_idf_sw = my_tf_idf_sw(newsfeed_by_week[the_column1], export_path)

The problems of non-stationarity and serial autocorrelation:¶

When using time series data, it is important to check that the series does not contain a long-term, regularly recurring and predictable pattern - a condition known as non-stationarity - since predictions from such data reflect the recurring trend rather than the underlying process. A stationary time series has a constant mean and variance. "Trend stationarity", whereby the series is stationary around a deterministic trend, is also acceptable. Taking the first difference of the data is a common remedy for non-stationarity, and it is what I do in this project.¶

Machine learning algorithms assume that the data are independent and identically distributed (i.i.d.). With time series data, however, each observation is at least somewhat dependent on the prior observation because of each observation's relationship with time - a phenomenon called serial autocorrelation. In other words, if we reordered time period-based observations we would not expect to get equivalent predictions. The residuals of time series observations are correlated; they are not independent of each other. Correcting for autocorrelation of the residuals often requires including lags, and sometimes other variables that account for the time period; differencing the data also often helps. In this project, I use first differencing and lags to help address the problem.¶

With nearly 37,000 features, I assume that non-stationarity and serial autocorrelation exist in my TF-IDF-transformed dataset.¶
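One way to spot-check these assumptions is to compare a single series' lag-1 autocorrelation and ADF p-value before and after differencing. A minimal sketch on one arbitrarily chosen TF-IDF column (illustrative only; any column will do):¶

from statsmodels.tsa.stattools import acf, adfuller

series = news_tf_idf_sw.iloc[:, 0]  # one feature column, chosen arbitrarily

print("lag-1 autocorrelation, raw:        ", acf(series, nlags=1)[1])
print("lag-1 autocorrelation, differenced:", acf(series.diff().dropna(), nlags=1)[1])
print("ADF p-value, differenced:          ", adfuller(series.diff().dropna())[1])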

Below, I first difference all of my TF-IDF transformed features:¶

In [57]:
news_tf_idf_sw_fd = news_tf_idf_sw.diff(periods=1)
In [58]:
news_tf_idf_sw_fd.shape
Out[58]:
(52, 36829)
In [59]:
news_tf_idf_sw_fd = news_tf_idf_sw_fd.dropna(axis=0)
In [60]:
news_tf_idf_sw_fd.shape #Now week 2 has been deleted by dropping the first row of missing values left by first differencing
Out[60]:
(51, 36829)
In [61]:
pickle.dump(news_tf_idf_sw_fd, open(export_path + "news_tf_idf_sw_fd.pkl", "wb"))
In [ ]:
news_tf_idf_sw_fd = pickle.load(open(export_path + "news_tf_idf_sw_fd.pkl", "rb"))
In [62]:
#Here, I create the target variables that I will attempt to predict from my newsfeed features using supervised machine learning models

y_battles = event_types_by_week_50_wks[['event_type_Battles']].copy()
y_explosions = event_types_by_week_50_wks[['event_type_Explosions/Remote violence']].copy()
y_protests = event_types_by_week_50_wks[['event_type_Protests']].copy()
y_riots = event_types_by_week_50_wks[['event_type_Riots']].copy()
y_vac = event_types_by_week_50_wks[['event_type_Violence against civilians']].copy()
In [63]:
print(y_battles.shape)
print(y_explosions.shape)
print(y_protests.shape)
print(y_riots.shape)
print(y_vac.shape)
(50, 1)
(50, 1)
(50, 1)
(50, 1)
(50, 1)
In [65]:
pickle.dump(y_battles, open(export_path + "y_battles_50_wks.pkl", "wb"))
pickle.dump(y_explosions, open(export_path + "y_explosions_50_wks.pkl", "wb"))
pickle.dump(y_protests, open(export_path + "y_protests_50_wks.pkl", "wb"))
pickle.dump(y_riots, open(export_path + "y_riots_50_wks.pkl", "wb"))
pickle.dump(y_vac, open(export_path + "y_vac_50_wks.pkl", "wb"))
In [ ]:
y_battles = pickle.load(open(export_path + "y_battles_50_wks.pkl", "rb"))
y_explosions = pickle.load(open(export_path + "y_explosions_50_wks.pkl", "rb"))
y_protests = pickle.load(open(export_path + "y_protests_50_wks.pkl", "rb"))
y_riots = pickle.load(open(export_path + "y_riots_50_wks.pkl", "rb"))
y_vac = pickle.load(open(export_path + "y_vac_50_wks.pkl", "rb"))

Investigating non-stationarity in the dependent variable:¶

With the Augmented Dickey-Fuller test, we check whether the null hypothesis of a unit root (i.e., non-stationarity) can be rejected for each target. The p-value in most cases is less than 0.05; in these instances we can safely reject the null hypothesis and do not need to first difference the target. Only for y_protests (the number of protests in a given week in which police responded with excessive force) and y_vac (events involving violence against civilians) are we unable to reject the null hypothesis, so we must assume non-stationarity is present.¶

In [66]:
from statsmodels.tsa.stattools import adfuller
adf_test = adfuller(y_battles)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1]))
ADF = -8.038047205293735
p-value = 1.87939942870067e-12
In [67]:
adf_test = adfuller(y_explosions)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1]))
ADF = -7.011085577942736
p-value = 6.919047048561405e-10
In [68]:
adf_test = adfuller(y_protests)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1])) #We cannot reject the null hypothesis of a unit root and non-statonarity since the p-value > 0.05.
ADF = -2.166443849384345
p-value = 0.21864897167202985
In [69]:
adf_test = adfuller(y_riots)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1]))
ADF = -5.660105937292464
p-value = 9.422640208691123e-07
In [70]:
adf_test = adfuller(y_vac)
print("ADF = " + str(adf_test[0]))
print("p-value = " +str(adf_test[1])) #We cannot reject the null hypothesis of a unit root and non-statonarity since the p-value > 0.05.
ADF = -1.6687567391397395
p-value = 0.4472775450272684
In [71]:
#We need to first difference y_protests and y_vac to make them stationary, but since I do not want to lose
#a row of these targets to first differencing, I will add back an extra row (week 3)
#before differencing. This will leave 50 weeks' worth of data, as the other targets have.

y_protests_orig = event_types_by_week[['event_type_Protests']].copy()
y_vac_orig = event_types_by_week[['event_type_Violence against civilians']].copy()
In [74]:
print(y_protests_orig.shape)
print(y_vac_orig.shape)
(53, 1)
(53, 1)
In [75]:
y_protests = y_protests_orig.iloc[2: , :]
y_vac = y_vac_orig.iloc[2: , :]
In [76]:
y_protests = y_protests.diff(periods=1).dropna(axis=0)
y_vac = y_vac.diff(periods=1).dropna(axis=0)

We must keep in mind that the interpretation of the machine learning models in which these targets are used will now change. We will be trying to predict weekly changes in the number of protests in which authorities used excessive force, and weekly changes in the number of events involving violence against civilians, based on weekly changes in the values of the features used in the model.¶

For the other target variables (battles, explosions or remote violence, and riots), we will be trying to predict the overall number of such events based on weekly changes in the values of the features used in the model.¶

In [77]:
print(y_protests.shape)
print(y_vac.shape)
(50, 1)
(50, 1)
In [78]:
pickle.dump(y_protests, open(export_path + "y_protests_diff.pkl", "wb"))
pickle.dump(y_vac, open(export_path + "y_vac_diff.pkl", "wb"))
In [ ]:
y_protests = pickle.load(open(export_path + "y_protests_diff.pkl", "rb"))
y_vac = pickle.load(open(export_path + "y_vac_diff.pkl", "rb"))

Below, I take one lag of each feature in the TF-IDF transformed dataset that has already been first differenced. I would normally want to account for many weekly lags, since news sentiment from weeks or even months before an event may well be predictive of that event occurring. However, I only have one year's worth of data; each lag taken effectively deletes one row of data, and each row can be crucial in helping the machine learning models find predictive signal (particularly when starting with only 52 observations), so in this portfolio example I default to using just one lag. It is noteworthy that AR(1) is the most common form of serial autocorrelation in the social sciences, even though the AR(1) assumption - that there is a strong correlation only between adjacent time periods - may not be completely realistic for my data. In any case, having first differenced the data should on its own correct for much of the autocorrelation (as well as non-stationarity) likely to exist.¶

Taking lag 1 forces the deletion of the first row of a given dataset because it creates a missing value (i.e., NA) in the first row of each lagged column. Even one missing value in a row forces the whole row to be deleted, unless the value that should precede the first row can be identified for each lagged column - which it cannot here, since I only have news articles and news snippets from 2019 to work with. Remember that the first row deleted would normally have been week 1 of 2019, but since week 1's data was missing to begin with, week 2 became my first row (and was deleted as a result of first differencing). Thus, week 3 is in fact the row that will be deleted after creating 1 lag of each feature.¶


In [79]:
#The following code for building a function that creates lagged features and includes them in the same dataset 
#as the non-lagged features originates from the following website:
#https://stackoverflow.com/questions/20410312/how-to-create-a-lagged-data-structure-using-pandas-dataframe

def buildLaggedFeatures(s, lag=1, dropna=True):

    if isinstance(s, pd.DataFrame):
        new_dict = {}
        for col_name in s:
            new_dict[col_name] = s[col_name]
            # create a lagged Series for each column
            for l in range(1, lag + 1):
                new_dict['%s_lag%d' % (col_name, l)] = s[col_name].shift(l)
        res = pd.DataFrame(new_dict, index=s.index)

    elif isinstance(s, pd.Series):
        the_range = range(lag + 1)
        res = pd.concat([s.shift(i) for i in the_range], axis=1)
        res.columns = ['lag_%d' % i for i in the_range]
    else:
        print('Only works for DataFrame or Series')
        return None
    if dropna:
        return res.dropna()
    return res
In [80]:
news_df = buildLaggedFeatures(news_tf_idf_sw_fd, lag=1, dropna=True)
#So now weeks 1, 2 and 3 are no longer in the dataset.
In [81]:
news_df.shape
Out[81]:
(50, 73658)

Below, I divide the features into a training set composed of the first 80% of the observations in time order and reserve the remaining 20% for the test set. Unlike sklearn's train_test_split, this approach does not randomly shuffle the observations into the train and test sets, so the time order of the data is preserved.¶

In [82]:
X_train, X_test= np.split(news_df, [int(.80 *len(news_df))])
In [83]:
X_train.shape
Out[83]:
(40, 73658)
In [84]:
X_test.shape
Out[84]:
(10, 73658)
In [85]:
y_battles_train, y_battles_test= np.split(y_battles, [int(.80 *len(y_battles))])
In [86]:
y_explosions_train, y_explosions_test= np.split(y_explosions, [int(.80 *len(y_explosions))])
In [87]:
y_protests_train, y_protests_test= np.split(y_protests, [int(.80 *len(y_protests))])
In [88]:
y_riots_train, y_riots_test= np.split(y_riots, [int(.80 *len(y_riots))])
In [89]:
y_vac_train, y_vac_test= np.split(y_vac, [int(.80 *len(y_vac))])


Below, I fit the Nigeria newsfeed-based features and the ACLED event targets to Ridge Regression, Lasso Regression, Random Forest Regression, and Extreme Gradient Boosting (XGBoost) Regression models in order to see which algorithm performs best for each target variable, comparing the Root Mean Squared Error (RMSE) of each model.¶


I begin by tuning hyperparameters with GridSearchCV to find the parameter values that yield the best RMSE for each model. GridSearchCV normally uses K-fold cross validation to compare a variety of user-specified parameter settings and determine which produce the best scoring metrics. Here, however, I pair it with Scikit-Learn's TimeSeriesSplit, which is appropriate for time series data because (unlike K-fold cross validation) it preserves the time order of the data throughout the cross validation process, using the "forward chaining" method depicted in the second diagram below.¶

With standard K-fold cross validation (shown in the first diagram), the original training set is divided into one fold for the cross validation test set and the remaining K-1 folds for the cross validation training set. The role of cross validation test set alternates systematically until every fold has served in it.¶

TimeSeriesSplit instead starts with a small cross validation training set, with the cross validation test set directly following it in time order. At each subsequent iteration, the observations previously in the cross validation test set join the new cross validation training set, and the new cross validation test set comprises the observations immediately following it in time order. This continues until the final observations in the dataset have served in the cross validation test set.¶

The number of user-defined folds in K-fold cross validation, or splits in TimeSeriesSplit, determines the number of test-set sections carved from the original training data, and consequently the number of training runs whose scoring metrics are averaged into an overall score. I use n_splits=5 because, with a dataset of thousands of features, a higher number of splits (e.g., 10) would be quite computationally expensive, though in theory it could improve predictive performance.¶
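As a quick illustration of this forward-chaining pattern, here is a minimal sketch of how TimeSeriesSplit with n_splits=5 would partition 40 time-ordered observations (the size of my training set below):¶

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(tscv.split(np.arange(40).reshape(-1, 1))):
    # The training window grows each fold; the test window always follows it in time.
    print(f"fold {fold}: train rows 0-{train_idx[-1]}, test rows {test_idx[0]}-{test_idx[-1]}")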



[Diagram: standard K-fold cross validation]

[Diagram: TimeSeriesSplit forward chaining]

Diagrams are from: https://www.kaggle.com/code/tomwarrens/timeseriessplit-how-to-use-it/notebook¶



Target: y_battles¶

Algorithm: Ridge¶

In [90]:
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
In [186]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_battles_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 2.5826187606826747}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -4.8603
In [187]:
print(grid.score(X_test, y_battles_test))
-5.587540975032525
In [190]:
ridge_best = Ridge(alpha=2.5826187606826747, random_state=42, max_iter=100000).fit(X_train, y_battles_train)
y_battles_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_battles_test, y_battles_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 5.5875
In [191]:
pickle.dump(ridge_best, open(export_path + "ridge_best_y_battles.pkl", "wb"))
In [313]:
ridge_best = pickle.load(open(export_path + "ridge_best_y_battles.pkl", "rb"))

Target: y_battles¶

Algorithm: Lasso¶

In [192]:
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
In [193]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_battles_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 0.3696912707195028}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -4.8892
In [194]:
lasso_best = Lasso(alpha=0.3696912707195028, random_state=42, max_iter=100000).fit(X_train, y_battles_train)
y_battles_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_battles_test, y_battles_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 5.6268
In [71]:
pickle.dump(lasso_best, open(export_path + "lasso_best_y_battles.pkl", "wb"))
In [ ]:
lasso_best = pickle.load(open(export_path + "lasso_best_y_battles.pkl", "rb"))

Target: y_battles¶

Algorithm: Random Forest Regression¶

In [196]:
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=1, max_features="sqrt"))
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
In [208]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
             'randomforestregressor__max_depth': [1, 10, 20, 30],
             'randomforestregressor__min_samples_leaf': [1, 5, 10],
             'randomforestregressor__max_features': [170, 220, 270, 320, 370]}
grid = GridSearchCV(rf_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_battles_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 220, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__n_estimators': 3000}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -4.6324
In [205]:
rf_best = RandomForestRegressor(random_state=42, n_estimators=3000, max_depth=10, min_samples_leaf=1, max_features=220, n_jobs=-1).fit(X_train, y_battles_train.values.ravel())
y_battles_preds_rf = rf_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_battles_test, y_battles_preds_rf))))
RMSE when Optimal Model is Fit to the Test Set: 5.5692
In [206]:
pickle.dump(rf_best, open(export_path + "rf_best_y_battles.pkl", "wb"))
In [ ]:
rf_best = pickle.load(open(export_path + "rf_best_y_battles.pkl", "rb"))

Target: y_battles¶

Algorithm: XGBoost Regression¶

In [283]:
xgb_pipe = make_pipeline(XGBRegressor(random_state=42, objective ='reg:squarederror', n_jobs=-1))
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             objective='reg:linear', random_state=42, reg_alpha=None,
             reg_lambda=None, scale_pos_weight=None, subsample=None,
             tree_method=None, validate_parameters=None, verbosity=None))]
In [288]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'xgbregressor__n_estimators': [50, 100, 150],
             'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
             'xgbregressor__max_depth': [4, 6, 8],
             'xgbregressor__min_child_weight': [1, 3, 5, 7]
             }
grid = GridSearchCV(xgb_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_battles_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
[22:24:20] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
{'xgbregressor__learning_rate': 0.05, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 50}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -4.1704
In [290]:
xgb_best = XGBRegressor(random_state=42, n_estimators=50, learning_rate=0.05, max_depth=4, min_child_weight=7, n_jobs=-1).fit(X_train, y_battles_train.values.ravel())
y_battles_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_battles_test, y_battles_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 5.4413
In [291]:
pickle.dump(xgb_best, open(export_path + "xgb_best_y_battles.pkl", "wb"))
In [ ]:
xgb_best = pickle.load(open(export_path + "xgb_best_y_battles.pkl", "rb"))


Comparison of Performance for Models on Cross Validation Data Using y_battles_train and X_train:

RMSE for:

  • XGBoost Regression: 4.1704
  • Random Forest Regression: 4.6324
  • Ridge Regression: 4.8603
  • Lasso Regression: 4.8892


Target: y_explosions¶

Algorithm: Ridge¶

In [209]:
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
In [211]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_explosions_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 3.1622776601683795}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -1.5569
In [212]:
ridge_best = Ridge(alpha= 3.1622776601683795, random_state=42, max_iter=100000).fit(X_train, y_explosions_train)
y_explosions_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_explosions_test, y_explosions_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 1.7543
In [213]:
pickle.dump(ridge_best, open(export_path + "ridge_best_y_explosions.pkl", "wb"))
In [ ]:
ridge_best = pickle.load(open(export_path + "ridge_best_y_explosions.pkl", "rb"))

Target: y_explosions¶

Algorithm: Lasso¶

In [214]:
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
In [215]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_explosions_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 0.07316807143427195}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -1.5465
In [216]:
lasso_best = Lasso(alpha= 0.07316807143427195, random_state=42, max_iter=100000).fit(X_train, y_explosions_train)
y_explosions_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_explosions_test, y_explosions_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 1.7357
In [217]:
pickle.dump(lasso_best, open(export_path + "lasso_best_y_explosions.pkl", "wb"))
In [ ]:
lasso_best = pickle.load(open(export_path + "lasso_best_y_explosions.pkl", "rb"))

Target: y_explosions¶

Algorithm: Random Forest Regression¶

In [250]:
from sklearn.ensemble import RandomForestRegressor
In [251]:
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=1, max_features="sqrt"))
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
In [275]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
             'randomforestregressor__max_depth': [1, 10, 20, 30],
             'randomforestregressor__min_samples_leaf': [1, 5, 10],
             'randomforestregressor__max_features': [170, 220, 270, 320, 370]}
grid = GridSearchCV(rf_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_explosions_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 320, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__n_estimators': 3000}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -1.5342
In [221]:
rf_best = RandomForestRegressor(random_state=42, n_estimators=3000, max_depth=10, min_samples_leaf=1, max_features=320, n_jobs=-1).fit(X_train, y_explosions_train.values.ravel())
y_explosions_preds_rf = rf_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_explosions_test, y_explosions_preds_rf))))
RMSE when Optimal Model is Fit to the Test Set: 1.7481
In [222]:
pickle.dump(rf_best, open(export_path + "rf_best_y_explosions.pkl", "wb"))
In [ ]:
rf_best = pickle.load(open(export_path + "rf_best_y_explosions.pkl", "rb"))

Target: y_explosions¶

Algorithm: XGBoost Regression¶

In [292]:
xgb_pipe = make_pipeline(XGBRegressor(random_state=42, objective ='reg:squarederror', n_jobs=-1))
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             random_state=42, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None))]
In [293]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'xgbregressor__n_estimators': [50, 100, 150],
             'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
             'xgbregressor__max_depth': [4, 6, 8],
             'xgbregressor__min_child_weight': [1, 3, 5, 7]
             }
grid = GridSearchCV(xgb_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_explosions_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'xgbregressor__learning_rate': 0.2, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 100}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -1.4564
In [294]:
xgb_best = XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.2, max_depth=4, min_child_weight=7, n_jobs=-1).fit(X_train, y_explosions_train.values.ravel())
y_explosions_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_explosions_test, y_explosions_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 1.7689
In [295]:
pickle.dump(xgb_best, open(export_path + "xgb_best_y_explosions.pkl", "wb"))
In [ ]:
xgb_best = pickle.load(open(export_path + "xgb_best_y_explosions.pkl", "rb"))


Comparison of Performance for Models on Cross Validation Data Using y_explosions_train and X_train:

RMSE for:

  • XGBoost Regression: 1.4564
  • Random Forest Regression: 1.5342
  • Lasso Regression: 1.5465
  • Ridge Regression: 1.5569


Target: y_protests¶

Algorithm: Ridge¶

In [107]:
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
In [224]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_protests_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 3.1622776601683795}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -108.2137
In [225]:
ridge_best = Ridge(alpha= 3.1622776601683795, random_state=42, max_iter=100000).fit(X_train, y_protests_train)
y_protests_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_protests_test, y_protests_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 116.6480
In [226]:
pickle.dump(ridge_best, open(export_path + "ridge_best_y_protests.pkl", "wb"))
In [ ]:
ridge_best = pickle.load(open(export_path + "ridge_best_y_protests.pkl", "rb"))

Target: y_protests¶

Algorithm: Lasso¶

In [112]:
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
In [227]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_protests_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 1.7225859653987856}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -107.6887
In [228]:
lasso_best = Lasso(alpha=1.7225859653987856, random_state=42, max_iter=100000).fit(X_train, y_protests_train)
y_protests_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_protests_test, y_protests_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 115.6709
In [229]:
pickle.dump(lasso_best, open(export_path + "lasso_best_y_protests.pkl", "wb"))
In [ ]:
lasso_best = pickle.load(open(export_path + "lasso_best_y_protests.pkl", "rb"))

Target: y_protests¶

Algorithm: Random Forest Regression¶

In [230]:
from sklearn.ensemble import RandomForestRegressor
In [231]:
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=1, max_features="sqrt"))
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
In [232]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
             'randomforestregressor__max_depth': [1, 10, 20, 30],
             'randomforestregressor__min_samples_leaf': [1, 5, 10],
             'randomforestregressor__max_features': [170, 220, 270, 320, 370]}
grid = GridSearchCV(rf_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_protests_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'randomforestregressor__max_depth': 1, 'randomforestregressor__max_features': 320, 'randomforestregressor__min_samples_leaf': 10, 'randomforestregressor__n_estimators': 2000}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -107.4333
In [233]:
rf_best = RandomForestRegressor(random_state=42, n_estimators=2000, max_depth=1, min_samples_leaf=10, max_features=320, n_jobs=-1).fit(X_train, y_protests_train.values.ravel())
y_protests_preds_rf = rf_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_protests_test, y_protests_preds_rf))))
RMSE when Optimal Model is Fit to the Test Set: 117.9724
In [234]:
pickle.dump(rf_best, open(export_path + "rf_best_y_protests.pkl", "wb"))
In [ ]:
rf_best = pickle.load(open(export_path + "rf_best_y_protests.pkl", "rb"))

Target: y_protests¶

Algorithm: XGBoost Regression¶

In [297]:
xgb_pipe = make_pipeline(XGBRegressor(random_state=42, objective ='reg:squarederror', n_jobs=-1))
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             random_state=42, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None))]
In [298]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'xgbregressor__n_estimators': [50, 100, 150],
             'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
             'xgbregressor__max_depth': [4, 6, 8],
             'xgbregressor__min_child_weight': [3, 5, 7]
             }
grid = GridSearchCV(xgb_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_protests_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'xgbregressor__learning_rate': 0.05, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 50}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -116.9903
In [299]:
xgb_best = XGBRegressor(random_state=42, n_estimators= 50, learning_rate=0.05, max_depth=4, min_child_weight=7, n_jobs=-1).fit(X_train, y_protests_train.values.ravel())
y_protests_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_protests_test, y_protests_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 124.2078
In [300]:
pickle.dump(xgb_best, open(export_path + "xgb_best_y_protests.pkl", "wb"))
In [ ]:
xgb_best = pickle.load(open(export_path + "xgb_best_y_protests.pkl", "rb"))


Comparison of Performance for Models on Cross Validation Data Using y_protests_train and X_train:

RMSE for:

  • Random Forest Regression: 107.4333
  • Lasso Regression: 107.6887
  • Ridge Regression: 108.2137
  • XGBoost Regression: 116.9903


Target: y_riots¶

Algorithm: Ridge¶

In [238]:
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
In [239]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_riots_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 3.1622776601683795}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -2.5179
In [240]:
ridge_best = Ridge(alpha= 3.1622776601683795, random_state=42, max_iter=100000).fit(X_train, y_riots_train)
y_riots_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_riots_test, y_riots_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 2.2682
In [241]:
pickle.dump(ridge_best, open(export_path + "ridge_best_y_riots.pkl", "wb"))
In [ ]:
ridge_best = pickle.load(open(export_path + "ridge_best_y_riots.pkl", "rb"))

Target: y_riots¶

Algorithm: Lasso¶

In [242]:
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
In [243]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_riots_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 0.07934096665797492}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -2.4961
In [244]:
lasso_best = Lasso(alpha=0.07934096665797492, random_state=42, max_iter=100000).fit(X_train, y_riots_train)
y_riots_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_riots_test, y_riots_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 2.2411
In [245]:
pickle.dump(lasso_best, open(export_path + "lasso_best_y_riots.pkl", "wb"))
In [ ]:
lasso_best = pickle.load(open(export_path + "lasso_best_y_riots.pkl", "rb"))

Target: y_riots¶

Algorithm: Random Forest Regression¶

In [246]:
from sklearn.ensemble import RandomForestRegressor
In [247]:
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=1, max_features="sqrt"))
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
In [248]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
             'randomforestregressor__max_depth': [1, 10, 20, 30],
             'randomforestregressor__min_samples_leaf': [1, 5, 10],
             'randomforestregressor__max_features': [170, 220, 270, 320, 370]}
grid = GridSearchCV(rf_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_riots_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'randomforestregressor__max_depth': 1, 'randomforestregressor__max_features': 370, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__n_estimators': 2000}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -2.3903
In [257]:
rf_best = RandomForestRegressor(random_state=42, n_estimators=2000, max_depth=1, min_samples_leaf=1, max_features=370, n_jobs=-1).fit(X_train, y_riots_train.values.ravel())
y_riots_preds_rf = rf_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_riots_test, y_riots_preds_rf))))
RMSE when Optimal Model is Fit to the Test Set: 2.1061
In [258]:
pickle.dump(rf_best, open(export_path + "rf_best_y_riots.pkl", "wb"))
In [ ]:
rf_best = pickle.load(open(export_path + "rf_best_y_riots.pkl", "rb"))

Target: y_riots¶

Algorithm: XGBoost Regression¶

In [301]:
xgb_pipe = make_pipeline(XGBRegressor(random_state=42, objective ='reg:squarederror', n_jobs=-1))
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             random_state=42, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None))]
In [302]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'xgbregressor__n_estimators': [50, 100, 150],
             'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
             'xgbregressor__max_depth': [4, 6, 8],
             'xgbregressor__min_child_weight': [3, 5, 7]
             }
grid = GridSearchCV(xgb_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_riots_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'xgbregressor__learning_rate': 0.05, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 5, 'xgbregressor__n_estimators': 50}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -2.1890
In [303]:
xgb_best = XGBRegressor(random_state=42, n_estimators=50, learning_rate=0.05, max_depth=4, min_child_weight=5, n_jobs=-1).fit(X_train, y_riots_train.values.ravel())
y_riots_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_riots_test, y_riots_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 1.9828
In [304]:
pickle.dump(xgb_best, open(export_path + "xgb_best_y_riots.pkl", "wb"))
In [ ]:
xgb_best = pickle.load(open(export_path + "xgb_best_y_riots.pkl", "rb"))


Comparison of Performance for Models on Cross Validation Data Using y_riots_train and X_train:

RMSE for:

  • XGBoost Regression: 2.1890
  • Random Forest Regression: 2.3903
  • Lasso Regression: 2.4961
  • Ridge Regression: 2.5179


Target: y_vac¶

Algorithm: Ridge¶

In [260]:
ridge_pipe = make_pipeline(Ridge(random_state=42, max_iter=100000))
print(ridge_pipe.steps)
[('ridge', Ridge(max_iter=100000, random_state=42))]
In [261]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'ridge__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(ridge_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_vac_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'ridge__alpha': 0.02171117945694504}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -119.6204
In [262]:
ridge_best = Ridge(alpha=0.02171117945694504, random_state=42, max_iter=100000).fit(X_train, y_vac_train)
y_vac_preds_ridge = ridge_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_vac_test, y_vac_preds_ridge))))
RMSE when Optimal Model is Fit to the Test Set: 157.2134
In [263]:
pickle.dump(ridge_best, open(export_path + "ridge_best_y_vac.pkl", "wb"))
In [ ]:
ridge_best = pickle.load(open(export_path + "ridge_best_y_vac.pkl", "rb"))

Target: y_vac¶

Algorithm: Lasso¶

In [264]:
lasso_pipe = make_pipeline(Lasso(random_state=42, max_iter=100000))
print(lasso_pipe.steps)
[('lasso', Lasso(max_iter=100000, random_state=42))]
In [265]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'lasso__alpha': np.logspace(-3, 0.5, 200)}
grid = GridSearchCV(lasso_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_vac_train)
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'lasso__alpha': 0.16446761779946645}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -118.2845
In [266]:
lasso_best = Lasso(alpha=0.16446761779946645, random_state=42, max_iter=100000).fit(X_train, y_vac_train)
y_vac_preds_lasso = lasso_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_vac_test, y_vac_preds_lasso))))
RMSE when Optimal Model is Fit to the Test Set: 159.3633
In [267]:
pickle.dump(lasso_best, open(export_path + "lasso_best_y_vac.pkl", "wb"))
In [ ]:
lasso_best = pickle.load(open(export_path + "lasso_best_y_vac.pkl", "rb"))

Target: y_vac¶

Algorithm: Random Forest Regression¶

In [268]:
from sklearn.ensemble import RandomForestRegressor
In [269]:
rf_pipe = make_pipeline(RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=1, max_features="sqrt"))
print(rf_pipe.steps)
[('randomforestregressor', RandomForestRegressor(max_depth=10, max_features='sqrt', random_state=42))]
In [270]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'randomforestregressor__n_estimators': [1000, 2000, 3000, 4000],
             'randomforestregressor__max_depth': [1, 10, 20, 30],
             'randomforestregressor__min_samples_leaf': [1, 5, 10],
             'randomforestregressor__max_features': [170, 220, 270, 320, 370]}
grid = GridSearchCV(rf_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_vac_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 270, 'randomforestregressor__min_samples_leaf': 5, 'randomforestregressor__n_estimators': 2000}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -127.2781
In [271]:
rf_best = RandomForestRegressor(random_state=42, n_estimators=2000, max_depth=10, min_samples_leaf=5, max_features=270, n_jobs=-1).fit(X_train, y_vac_train.values.ravel())
y_vac_preds_rf = rf_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_vac_test, y_vac_preds_rf))))
RMSE when Optimal Model is Fit to the Test Set: 121.5632
In [272]:
pickle.dump(rf_best, open(export_path + "rf_best_y_vac.pkl", "wb"))
In [ ]:
rf_best = pickle.load(open(export_path + "rf_best_y_vac.pkl", "rb"))

Target: y_vac¶

Algorithm: XGBoost Regression¶

In [301]:
xgb_pipe = make_pipeline(XGBRegressor(random_state=42, objective ='reg:squarederror', n_jobs=-1))
print(xgb_pipe.steps)
[('xgbregressor', XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             random_state=42, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None))]
In [306]:
tscv = TimeSeriesSplit(n_splits=5)

param_grid = {'xgbregressor__n_estimators': [50, 100, 150],
             'xgbregressor__learning_rate': [0.05, 0.1, 0.15, 0.2],
             'xgbregressor__max_depth': [4, 6, 8],
             'xgbregressor__min_child_weight': [3, 5, 7]
             }
grid = GridSearchCV(xgb_pipe, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_vac_train.values.ravel())
print(grid.best_params_)
print('Mean RMSE when Optimal Model is Fit to Cross Validation Data: {:.4f}'.format(grid.best_score_))
{'xgbregressor__learning_rate': 0.15, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 150}
Mean RMSE when Optimal Model is Fit to Cross Validation Data: -123.4189
In [310]:
xgb_best = XGBRegressor(random_state=42, n_estimators=150, learning_rate=0.15, max_depth=4, min_child_weight=7, n_jobs=-1).fit(X_train, y_vac_train.values.ravel())
y_vac_preds_xgb = xgb_best.predict(X_test)
print('RMSE when Optimal Model is Fit to the Test Set: {:.4f}'.format(sqrt(mean_squared_error(y_vac_test, y_vac_preds_xgb))))
RMSE when Optimal Model is Fit to the Test Set: 158.5494
In [311]:
pickle.dump(xgb_best, open(export_path + "xgb_best_y_vac.pkl", "wb"))
In [ ]:
xgb_best = pickle.load(open(export_path + "xgb_best_y_vac.pkl", "rb"))


Comparison of Performance for Models on Cross Validation Data Using y_vac_train and X_train:

RMSE for:

  • Lasso Regression: 118.2845
  • Ridge Regression: 119.6204
  • XGBoost Regression: 123.4189
  • Random Forest Regression: 127.2781


Summary:¶

The models performed best when predicting explosions or remote violence, followed by riots. They performed worst when predicting weekly changes in events involving violence against civilians and weekly changes in the number of protests in which excessive police force was used. XGBoost outperformed the other models in every event type category except those two differenced targets, where it was among the worst performers. Random Forest was the best model for predicting changes in the number of protests in which excessive police force was used, and the second best for predicting the number of battles, the number of explosions or remote violence events, and the number of riots.¶