Global Terrorist Attacks

The Global Terrorism Database (GTD) is used here to classify the terrorist group likely to be involved in an attack, based on features such as the number of kills, the region, and the weapons used.
The tutorial contains a Jupyter notebook that preprocesses the data and classifies the terrorist groups using machine learning techniques, namely a Support Vector Machine and a Decision Tree, which reach accuracies of 96.55% and 98.27% respectively.
The inputs and labels of the model are stored in the x_normed.npy and y.npy files respectively.
Further details about the database can be found in the Codebook.pdf file.
import pandas as pd
import numpy as np
import pickle, gzip
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
Removing NA values
df = pd.read_excel('globalterrorismdb_0718dist.xlsx') # reading all data
df.shape
(181691, 135)
df.head()
|   | iyear | imonth | iday | approxdate | extended | resolution | country | ... | dbsource | INT_LOG | INT_IDEO | INT_MISC | INT_ANY | related |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1970 | 7 | 2 | NaN | 0 | NaT | 58 | ... | PGIS | 0 | 0 | 0 | 0 | NaN |
| 1 | 1970 | 0 | 0 | NaN | 0 | NaT | 130 | ... | PGIS | 0 | 1 | 1 | 1 | NaN |
| 2 | 1970 | 1 | 0 | NaN | 0 | NaT | 160 | ... | PGIS | -9 | -9 | 1 | 1 | NaN |
| 3 | 1970 | 1 | 0 | NaN | 0 | NaT | 78 | ... | PGIS | -9 | -9 | 1 | 1 | NaN |
| 4 | 1970 | 1 | 0 | NaN | 0 | NaT | 101 | ... | PGIS | -9 | -9 | 1 | 1 | NaN |
5 rows × 135 columns
df.isnull().any() # checking which columns contain null values
eventid False
iyear False
imonth False
iday False
approxdate True
extended False
resolution True
country False
...
propextent True
propextent_txt True
propvalue True
propcomment True
ishostkid True
nhostkid True
nhostkidus True
nhours True
INT_MISC False
INT_ANY False
related True
Length: 135, dtype: bool
# To remove all columns having null values > 5000
for i in df.columns:
    if df[i].isnull().sum() > 5000:
        df.drop(columns=i, inplace=True)
df.shape
(181691, 41)
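The same pruning can be written in a single call; a minimal sketch (assuming the same 5000-null cutoff) using DataFrame.dropna with a thresh argument, which keeps only columns having at least len(df) - 5000 non-null values:

# equivalent sketch: drop every column with more than 5000 missing values
df = df.dropna(axis=1, thresh=len(df) - 5000)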
# further reduction of unnecessary columns after looking at the Codebook (documentation)
df.drop(columns=['eventid', 'extended', 'country_txt', 'region_txt', 'specificity', 'vicinity',
                 'crit1', 'crit2', 'crit3', 'attacktype1_txt', 'targtype1_txt', 'natlty1_txt',
                 'guncertain1', 'individual', 'weaptype1_txt', 'dbsource', 'INT_MISC'], inplace=True)
df.shape
(181691, 24)
# saving file for future use
df.to_csv('terrorist_data_pruned.csv', index=False)
Filling NA values
df2 = pd.read_csv('terrorist_data_pruned.csv')
df2.head()
|   | iyear | imonth | iday | country | region | provstate | city | ... | natlty1 | gname | weaptype1 | property | ishostkid | INT_LOG |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1970 | 7 | 2 | 58 | 2 | -1 | Santo Domingo | ... | 58.0 | MANO-D | 13 | 0 | 0.0 | 0 |
| 1 | 1970 | 0 | 0 | 130 | 1 | 0 | Mexico city | ... | 21.0 | 23rd of September Communist League | 13 | 0 | 1.0 | 0 |
| 2 | 1970 | 1 | 0 | 160 | 5 | 1 | Unknown | ... | 217.0 | Unknown | 13 | 0 | 0.0 | -9 |
| 3 | 1970 | 1 | 0 | 78 | 8 | 2 | Athens | ... | 217.0 | Unknown | 6 | 1 | 0.0 | -9 |
| 4 | 1970 | 1 | 0 | 101 | 4 | 3 | Fukouka | ... | 217.0 | Unknown | 8 | 1 | 0.0 | -9 |
5 rows × 24 columns
# replacing NaN values (refer to Codebook.pdf)
df2['doubtterr'].fillna(-9, inplace=True)
df2['multiple'].fillna(0, inplace=True)
df2['natlty1'].fillna(0, inplace=True)
df2['ishostkid'].fillna(-9, inplace=True)
df2['latitude'].fillna(0, inplace=True)
df2['longitude'].fillna(0, inplace=True)
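The six replacements above can also be expressed as a single fillna call with a column-to-value mapping; a minimal equivalent sketch:

# sketch: fill each column's NaNs with its Codebook default value in one call
fill_values = {'doubtterr': -9, 'multiple': 0, 'natlty1': 0,
               'ishostkid': -9, 'latitude': 0, 'longitude': 0}
df2.fillna(value=fill_values, inplace=True)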
# encoding categorical columns as integer codes
df2['provstate'] = pd.factorize(df2['provstate'])[0]
df2['iyear'] = pd.factorize(df2['iyear'])[0]
df2['city'] = pd.factorize(df2['city'])[0]
df2['country'] = pd.factorize(df2['country'])[0]
df2['target1'] = pd.factorize(df2['target1'])[0]
df2['gname'] = pd.factorize(df2['gname'])[0]
df2['natlty1'] = pd.factorize(df2['natlty1'])[0]
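pd.factorize maps each distinct value to an integer code in order of first appearance and returns the codes together with the unique values; a small illustration (not part of the original notebook):

# toy example: strings become integer codes, in order of first appearance
codes, uniques = pd.factorize(pd.Series(['Unknown', 'MANO-D', 'Unknown']))
# codes   -> array([0, 1, 0])
# uniques -> Index(['Unknown', 'MANO-D'], dtype='object')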
df2.isna().any()
iyear False
imonth False
iday False
country False
region False
provstate False
city False
latitude False
longitude False
doubtterr False
multiple False
success False
suicide False
attacktype1 False
targtype1 False
target1 False
natlty1 False
gname False
weaptype1 False
property False
ishostkid False
INT_LOG False
INT_IDEO False
INT_ANY False
dtype: bool
# saving file for future use
df2.to_csv('terrorist_data_pruned.csv', index=False)
Normalizing Columns
df = pd.read_csv('terrorist_data_pruned.csv')
df.head()
|   | iyear | imonth | iday | country | region | provstate | city | latitude | ... | natlty1 | gname | weaptype1 | property | ishostkid | INT_LOG |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 7 | 2 | 0 | 2 | 0 | 0 | 18.456792 | ... | 0 | 0 | 13 | 0 | 0.0 | 0 |
| 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 19.371887 | ... | 1 | 1 | 13 | 0 | 1.0 | 0 |
| 2 | 0 | 1 | 0 | 2 | 5 | 2 | 2 | 15.478598 | ... | 2 | 2 | 13 | 0 | 0.0 | -9 |
| 3 | 0 | 1 | 0 | 3 | 8 | 3 | 3 | 37.997490 | ... | 2 | 2 | 6 | 1 | 0.0 | -9 |
| 4 | 0 | 1 | 0 | 4 | 4 | 4 | 4 | 33.580412 | ... | 2 | 2 | 8 | 1 | 0.0 | -9 |
5 rows × 24 columns
# checking frequency of column values
plt.figure(figsize=(17, 10))
i = 0
for col in df.columns:
    if col in ('latitude', 'longitude', 'city', 'provstate', 'target1', 'gname'):
        continue
    i += 1
    plt.subplot(6, 3, i)
    num_list = df[col].value_counts()
    plt.bar(num_list.index, num_list.values)
    plt.title(col)
plt.tight_layout()
plt.show()
# classes with at most 600 samples are dropped from the dataframe to reduce skewness in the data
num = df['gname'].value_counts()
res = num.where(num>600).dropna()
res
2 82782.0
2001 7478.0
3024 5613.0
455 4555.0
478 3351.0
2699 3288.0
15 2772.0
46 2671.0
203 2487.0
2819 2418.0
1057 2310.0
78 2024.0
2589 1878.0
631 1630.0
221 1606.0
123 1561.0
2695 1351.0
36 1125.0
2632 1062.0
2549 1020.0
948 895.0
1019 830.0
863 716.0
157 639.0
2526 638.0
598 632.0
3136 624.0
281 607.0
Name: gname, dtype: float64
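The same thresholding can also be written with boolean indexing instead of where/dropna; an equivalent sketch:

# sketch: keep only group codes that occur more than 600 times
res = num[num > 600]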
# collect indices of rows belonging to rare groups, and cap group 2 ('Unknown') at 5000 rows
k = []
c = 0
for index, row in df.iterrows():
    if row['gname'] not in list(res.index):
        k.append(index)
    if row['gname'] == 2:
        c += 1
        if c > 5000:
            k.append(index)
df.drop(index=k, inplace=True)
df.shape
(60781, 24)
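Row-wise iteration with iterrows is slow on ~180k rows; a vectorized sketch of the same filtering (assuming, as above, that group code 2 is the 'Unknown' class to be capped at its first 5000 rows):

# sketch: keep frequent groups, then cap group 2 at its first 5000 occurrences
df = df[df['gname'].isin(res.index)]
df = df.drop(index=df[df['gname'] == 2].index[5000:])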
# converts labels to one-hot encoding
y = pd.get_dummies(df['gname']).values
y.shape
(60781, 28)
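pd.get_dummies turns the integer group codes into a one-hot indicator matrix with one column per class; a toy illustration (values are 0/1 or True/False depending on the pandas version):

# toy example: three classes -> three indicator columns
pd.get_dummies(pd.Series([0, 2, 1, 0])).values
# -> [[1, 0, 0],
#     [0, 0, 1],
#     [0, 1, 0],
#     [1, 0, 0]]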
# normalizing feature values
x = df.drop(columns='gname')
x = x.values
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df2 = pd.DataFrame(x_scaled)
x_scaled.shape
(60781, 23)
df2.head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | ... | 17 | 18 | 19 | 20 | 21 | 22 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.083333 | 0.000000 | 0.010 | 0.363636 | 0.000708 | 0.000082 | 0.572701 | ... | 1.000000 | 0.9 | 0.9 | 0.0 | 0.0 | 1.0 |
| 1 | 0.0 | 0.083333 | 0.000000 | 0.015 | 0.636364 | 0.001062 | 0.000109 | 0.782926 | ... | 0.363636 | 1.0 | 0.9 | 0.0 | 0.0 | 1.0 |
| 2 | 0.0 | 0.083333 | 0.000000 | 0.020 | 0.272727 | 0.001416 | 0.000136 | 0.741690 | ... | 0.545455 | 1.0 | 0.9 | 0.0 | 0.0 | 1.0 |
| 3 | 0.0 | 0.083333 | 0.064516 | 0.025 | 0.000000 | 0.002478 | 0.000218 | 0.781007 | ... | 0.363636 | 1.0 | 0.9 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 0.083333 | 0.258065 | 0.035 | 0.636364 | 0.003540 | 0.000327 | 0.819274 | ... | 0.272727 | 0.9 | 0.9 | 0.0 | 0.0 | 1.0 |
5 rows × 23 columns
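MinMaxScaler rescales each feature column independently to the [0, 1] range via (x - min) / (max - min); a toy check (not part of the original notebook):

# toy check: column minima map to 0, column maxima map to 1
demo = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 40.0]])
MinMaxScaler().fit_transform(demo)
# -> [[0.  , 0.   ],
#     [0.5 , 0.333],
#     [1.  , 1.   ]]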
# saving file for future use
np.save('x_normed.npy', x_scaled)
np.save('y.npy', y)
Classification
x = np.load('x_normed.npy')
y = np.load('y.npy')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
(54702, 23) (54702, 28) (6079, 23) (6079, 28)
model1 = svm.SVC(C=5, kernel='rbf')
model2 = DecisionTreeClassifier()
model1.fit(x_train, np.argmax(y_train, axis=1))
SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
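scikit-learn classifiers expect a 1-D array of integer class labels rather than one-hot vectors, so np.argmax(..., axis=1) is used to recover each row's class index from the one-hot encoding; a tiny illustration:

# each one-hot row collapses back to its class index
np.argmax(np.array([[0, 0, 1], [1, 0, 0]]), axis=1)  # -> array([2, 0])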
model2.fit(x_train, np.argmax(y_train, axis=1))
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')
pred1 = model1.predict(x_test)
acc1 = np.mean(pred1==np.argmax(y_test, axis=1))*100
pred2 = model2.predict(x_test)
acc2 = np.mean(pred2==np.argmax(y_test, axis=1))*100
print('Accuracy SVC: {:.2f}%\nAccuracy Decision Tree: {:.2f}%'.format(acc1, acc2))
Accuracy SVC: 96.55%
Accuracy Decision Tree: 98.27%
cm1 = confusion_matrix(np.argmax(y_test, axis=1), pred1)
cm2 = confusion_matrix(np.argmax(y_test, axis=1), pred2)
plt.subplot(1, 2, 1)
plt.imshow(cm1)
plt.title('Confusion Matrix (SVC)')
plt.subplot(1, 2, 2)
plt.imshow(cm2)
plt.title('Confusion Matrix (Decision Tree)')
plt.tight_layout()
plt.show()
pr1 = precision_recall_fscore_support(np.argmax(y_test, axis=1), pred1)
pr2 = precision_recall_fscore_support(np.argmax(y_test, axis=1), pred2)
print('SVC\nPrecision: {:.2f}%\nRecall: {:.2f}%\nF-score: {:.2f}%'.format(np.mean(pr1[0])*100, np.mean(pr1[1])*100, np.mean(pr1[2])*100))
print('\nDecision Tree\nPrecision: {:.2f}%\nRecall: {:.2f}%\nF-score: {:.2f}%'.format(np.mean(pr2[0])*100, np.mean(pr2[1])*100, np.mean(pr2[2])*100))
SVC
Precision: 96.12%
Recall: 94.68%
F-score: 94.52%
Decision Tree
Precision: 97.76%
Recall: 97.95%
F-score: 97.84%
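Taking the unweighted mean of the per-class scores returned by precision_recall_fscore_support, as done above, corresponds to macro-averaging; an equivalent sketch that lets scikit-learn do the averaging directly:

# sketch: macro-averaged precision, recall and F-score in one call
p, r, f, _ = precision_recall_fscore_support(np.argmax(y_test, axis=1), pred2, average='macro')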