@@ -9,11 +9,14 @@ from math import sqrt
 import json
 import os.path
+#from birdseye import eye

 # Get the directory of the current script to use in importing data
 # and exporting the model.
 basepath = os.path.dirname(os.path.realpath(__file__))

 # Load a CSV file: defines the function that reads the CSV and builds the dataset
+#@eye
 def load_csv(filename):
     dataset = list()
     with open(filename, 'r') as file:
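The only functional change in this diff is the commented-out birdseye instrumentation added in each hunk. For reference, a minimal sketch of what enabling it would look like, using a hypothetical demo function; it assumes birdseye is installed (pip install birdseye) and its viewer is running, none of which is part of this diff:

from birdseye import eye

@eye            # records every expression evaluated during the call for the web viewer
def demo(a, b):
    return a + b

demo(2, 3)      # the traced call can then be inspected in the birdseye UI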
@@ -25,11 +28,13 @@ def load_csv(filename):
     return dataset

 # Convert string column to float - the original dataset is in string format
+#@eye
 def str_column_to_float(dataset, column):
     for row in dataset:
         row[column] = float(row[column].strip())

 # Convert string column to integer / transforms the classes 'mine' and 'rock' into 1 and 2
+#@eye
 def str_column_to_int(dataset, column):
     # extract the class value of every row: a list of all the 'mine'/'rock' labels
     class_values = [row[column] for row in dataset]
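The lookup-table construction is elided between these hunks; the sketch below shows the usual approach, an assumption based on the surrounding comments rather than the exact elided code (the _sketch name is hypothetical):

def str_column_to_int_sketch(dataset, column):
    # map each distinct class label to a small integer, e.g. {'mine': 0, 'rock': 1}
    lookup = {value: i for i, value in enumerate(set(row[column] for row in dataset))}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup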
@@ -49,6 +54,7 @@ def str_column_to_int(dataset, column):
     return lookup

 # Split a dataset into k folds
+#@eye
 def cross_validation_split(dataset, n_folds):
     # creates the list that will hold the folds
     dataset_split = list()
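The fold-building loop itself is elided; below is a self-contained sketch of the standard technique, sampling rows without replacement until each fold holds len(dataset) / n_folds rows (assumed from the comments, not the verbatim elided body):

from random import randrange

def cross_validation_split_sketch(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            # pop a random row so it cannot land in two folds
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split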
@@ -70,6 +76,7 @@ def cross_validation_split(dataset, n_folds):
     return dataset_split  # a list of folds

 # Calculate accuracy percentage
+#@eye
 def accuracy_metric(actual, predicted):
     correct = 0
     # loop over the indices of the list of actual classes
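Assuming the elided loop counts position-wise matches between the two lists, a quick usage example: three agreements out of four rows give 75%.

# 3 of the 4 predictions match the actual classes
assert accuracy_metric([0, 1, 1, 0], [0, 1, 0, 0]) == 75.0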
@@ -82,6 +89,7 @@ def accuracy_metric(actual, predicted):
     return correct / float(len(actual)) * 100.0

 # Evaluate an algorithm using a cross-validation split
+#@eye
 def evaluate_algorithm(dataset, algorithm, n_folds, *args):
     # split the dataset into n_folds folds
     folds = cross_validation_split(dataset, n_folds)
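The evaluation loop is elided after this point; a sketch of how such a cross-validation harness typically continues (assumed shape, hypothetical _sketch name): each fold serves once as the test set while the remaining folds form the training set.

def evaluate_algorithm_sketch(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        # train on every fold except the held-out one
        train_set = [row for f in folds if f is not fold for row in f]
        # copy the test rows and hide the class label from the algorithm
        test_set = [list(row)[:-1] + [None] for row in fold]
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores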
@@ -120,6 +128,7 @@ def evaluate_algorithm(dataset, algorithm, n_folds, *args):

 # Split a dataset based on a feature and a feature value defined in build_tree
 # just trying many times, benefiting from the speed of the computer
+#@eye
 def test_split(index, value, dataset):
     left, right = list(), list()
     for row in dataset:
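The comparison inside the loop is elided; a sketch of the standard row-partitioning step the comments describe (assumed, with a hypothetical _sketch name):

def test_split_sketch(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        # rows whose feature value is below the threshold go left, the rest go right
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right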
@@ -134,6 +143,7 @@ def test_split(index, value, dataset):

 # Calculate the Gini index for a split dataset, using left/right of test_split as the groups
 # cf. calculating wealth distribution: https://en.wikipedia.org/wiki/Gini_coefficient
+#@eye
 def gini_index(groups, classes):
     # count all samples at the split point (the whole dataset), as a float so the divisions below work
     n_instances = float(sum([len(group) for group in groups]))
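A worked example of the weighted Gini computation this function performs, with illustrative numbers (here the groups are reduced to bare class labels, while the real function works on full rows):

# a split producing one mixed and one pure group
left, right = [1, 1, 0], [0, 0]
n = len(left) + len(right)        # 5 samples at the split point

def impurity(labels):
    return 1.0 - sum((labels.count(c) / len(labels)) ** 2 for c in set(labels))

# weight each group's impurity by its relative size
gini = impurity(left) * len(left) / n + impurity(right) * len(right) / n
print(round(gini, 3))             # 0.267: left contributes 0.444 * 3/5, right is pure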
@@ -158,6 +168,7 @@ def gini_index(groups, classes):
     return gini

 # Select the best split point for a dataset
+#@eye
 def get_split(dataset, n_features):
     # take the last element of each row (the class); as a set it contains only the 2 distinct values
     class_values = list(set(row[-1] for row in dataset))
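The search loop is elided; a sketch of the usual exhaustive search over a random subset of features, trying every row's value as a candidate threshold (assumed shape, hypothetical _sketch name):

from random import randrange

def get_split_sketch(dataset, n_features):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    # sample n_features distinct feature indices at random
    features = list()
    while len(features) < n_features:
        index = randrange(len(dataset[0]) - 1)
        if index not in features:
            features.append(index)
    # evaluate every (feature, row value) pair as a candidate split
    for index in features:
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups, 'gini': b_score}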
@@ -187,6 +198,7 @@ def get_split(dataset, n_features):
     return {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score}

 # Create a terminal node value: a node at the end of the tree, i.e. a leaf
+#@eye
 def to_terminal(group):
     # list the class of every row in the group
     outcomes = [row[-1] for row in group]
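The return line in the next hunk relies on the max(set(...), key=list.count) idiom, which picks the most frequent label; a tiny example:

outcomes = [0, 1, 1, 0, 1]
# each distinct label is counted once; ties are resolved arbitrarily
print(max(set(outcomes), key=outcomes.count))   # 1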
@@ -195,6 +207,7 @@ def to_terminal(group):
     return max(set(outcomes), key=outcomes.count)

 # Count the number of unique class values in a 'group' (rows of the dataset)
+#@eye
 def count_unique_values(group):
     # pick the classes in the group, transform them to a set
     # and count the number of distinct values
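The short body is elided; given the comments, it presumably reduces to the following (an assumption, hypothetical _sketch name):

def count_unique_values_sketch(group):
    # one entry per distinct class label present in the group
    return len(set(row[-1] for row in group))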
@@ -203,6 +216,7 @@ def count_unique_values(group):
 # Create child splits for a node or make terminals/end leaves
 # recursive function: it calls itself
 # node is the dictionary returned by get_split (b_index, b_value, b_groups)
+#@eye
 def split(node, max_depth, min_size, n_features, depth):
     left, right = node['groups']
     del(node['groups'])
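The roughly forty elided lines implement the recursion described above; a condensed sketch of that control flow (assumed from the comments and the standard algorithm, not the verbatim body):

def split_sketch(node, max_depth, min_size, n_features, depth):
    left, right = node.pop('groups')
    # no real split happened: both children collapse into one terminal
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # depth limit reached: close both branches with terminals
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    for side, group in (('left', left), ('right', right)):
        if len(group) <= min_size:
            node[side] = to_terminal(group)
        else:
            node[side] = get_split(group, n_features)
            split_sketch(node[side], max_depth, min_size, n_features, depth + 1)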
@@ -244,6 +258,7 @@ def split(node, max_depth, min_size, n_features, depth):
     # returns no value: the recursive calls mutate the same node dictionaries in place

 # Build a decision tree
+#@eye
 def build_tree(train, max_depth, min_size, n_features):
     # the root of the decision tree is the dictionary of index, value and the 2 groups (left/right of the split)
     root = get_split(train, n_features)
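After finding the root split, the elided remainder most likely just kicks off the recursion; a sketch:

def build_tree_sketch(train, max_depth, min_size, n_features):
    root = get_split(train, n_features)
    split(root, max_depth, min_size, n_features, 1)   # grow the tree from depth 1
    return root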
@@ -255,6 +270,7 @@ def build_tree(train, max_depth, min_size, n_features):

 # Make a prediction with a decision tree
 # a recursive function as well
+#@eye
 def predict(node, row):
     # node['index'] is the feature column: look up this row's value for that feature
     # compare the feature value of the row being checked with the feature value of the node
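A sketch of the recursive descent these comments describe (assumed shape, hypothetical _sketch name): internal nodes are dictionaries, terminals are bare class labels.

def predict_sketch(node, row):
    # go left when the row's feature value is below the node's threshold
    branch = node['left'] if row[node['index']] < node['value'] else node['right']
    # a dict is an internal node; anything else is a terminal class label
    return predict_sketch(branch, row) if isinstance(branch, dict) else branch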
@@ -278,6 +294,7 @@ def predict(node, row):
 # Create a random subsample from the dataset with replacement; the ratio is called sample_size further on
 # This is called BOOTSTRAPPING: build new datasets from the original data, with the same number of rows
 # with replacement: after selecting a row we put it back into the data, so it can be selected twice or more
+#@eye
 def subsample(dataset, ratio):
     sample = list()
     # if the ratio is smaller than 1, not the whole dataset is taken as the sample - here the full dataset is used
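The sampling loop is elided; a sketch of bootstrapping as the comments define it, drawing with replacement until the sample reaches ratio times the dataset size (assumed, hypothetical _sketch name):

from random import randrange

def subsample_sketch(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        # with replacement: the same row may be drawn more than once
        sample.append(dataset[randrange(len(dataset))])
    return sample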
@@ -288,6 +305,7 @@ def subsample(dataset, ratio):
     return sample

 # Make a prediction with a list of bagged trees
+#@eye
 def bagging_predict(trees, row):
     # ask every tree in the forest to vote on the class of this row; this gives a list of votes
     predictions = [predict(tree, row) for tree in trees]
@@ -295,6 +313,7 @@ def bagging_predict(trees, row):
     return max(set(predictions), key=predictions.count)

 # Random Forest Algorithm
+#@eye
 def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):
     trees = list()
     for _ in range(n_trees):
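The rest of the training loop lies outside this diff; a condensed sketch of how the pieces above typically compose. The persistence step is purely an assumption suggested by the new model_path parameter and the json import, not code from this file:

def random_forest_sketch(train, test, max_depth, min_size, sample_size,
                         n_trees, n_features, model_path=None):
    trees = list()
    for _ in range(n_trees):
        # each tree is trained on its own bootstrap sample of the training data
        sample = subsample(train, sample_size)
        trees.append(build_tree(sample, max_depth, min_size, n_features))
    if model_path is not None:
        # hypothetical export step: the trees are plain dicts of ints/floats, so JSON works
        with open(model_path, 'w') as f:
            json.dump(trees, f)
    return [bagging_predict(trees, row) for row in test]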