# Random Forest

# Setting the space
### importing Librairies

In [27]:
from random import seed
from random import randrange
from csv import reader
from math import sqrt
import json


### Load a CSV file. 

Definition of the function to read the csv and create dataset

In [28]:
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset


# Preparing the data
### Conversion of certain data
extract the values of the column (here types of Iris)
calculate how many unique class values there are and store them into a set: a list with unique values
Tranform class values into numbers/integers

In [29]:
# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset] # extract the values of the column (here the classes of the dataset, mine and rocks)
	unique = set(class_values) # calculate how many unique class values there are and store them into a set: a list with unique values
	lookup = dict() #create a dictionnary
	for i, value in enumerate(unique): # loops through the set / enumerate gives you a tuple with an index number and a value /common way to get indexes from a list
		lookup[value] = i # the key of the dictonnary is the value: mine or rock/or types of iris; and the value is a number: 0 or 1 (or 2)
	for row in dataset: # loops through the rows of the dataset
		row[column] = lookup[row[column]] #replaces the value of the column: rock or mine/or types of iris, with the index value: 0 or (1 or 2);
	return lookup # the code returns the lookup table


### Function to create a list of folds (divide the dataset into smaller subsets)

In [30]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list() #create a list 
	dataset_copy = list(dataset) # creates a list of the dataset. You could use the copy library
	fold_size = int(len(dataset) / n_folds) # the size of the fold is equal to the length of the dataset divided by the amount of folds
	print("fold size:")
	print(fold_size) 
	for i in range(n_folds): #loops the amount of folds : generate another list with numbers from 0 up to n_fold
		fold = list() #create a list
		while len(fold) < fold_size: # as long as the length of the list is inferior to the defined fold size
			index = randrange(len(dataset_copy)) # return a random integer between 0 and the total length of the dataset and store it in index
			fold.append(dataset_copy.pop(index)) # append an observation at the index and removes it from the dataset
		dataset_split.append(fold) # append the fold to the list dataset_split
		#print("______________")
		#print("dataset split:")
		#print(dataset_split)
		#print("______________")
	return dataset_split #return the dataset_split, a list of folds


# Functions definitions
### Calculate accuracy in the prediction

In [31]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0 #create a correct variable
	for i in range(len(actual)): # loops up to the length of the actual list
		if actual[i] == predicted[i]: # compares the actual vs the predicted
			correct += 1 #if correct add one to the correct variable. Count the number of correct guesses
	return correct / float(len(actual)) * 100.0 #gives a percentage by dividing the correct guesses by the length of the actual classes and multiply by a hundred
 

### Create a list of score for each algorithm/tree

In [32]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds) #return the list of folds
	scores = list() #creates a list called scores
	for fold in folds: #loops through the folds
		train_set = list(folds) #creates a copy of the list of folds 
		train_set.remove(fold) #remove one fold: for testing?
		train_set = sum(train_set, []) # concatenate all folds, a list of lists, into one list. can be done with itertools.chain
		test_set = list() # create another list
		for row in fold: # iterates through fold
			row_copy = list(row) # creates a copy of the list
			test_set.append(row_copy) # append the list to the test_set
			row_copy[-1] = None # set the classification to none. Changes the last column to none.
		predicted = algorithm(train_set, test_set, *args) # what is doing the prediction takes for argument the train and test set and returns prediction for the test set
		actual = [row[-1] for row in fold] # list comprehension: list of actual classes from fold.
		accuracy = accuracy_metric(actual, predicted) # function that compares the actual vs the predicted to give an idea of the accuracy of the prediction
		scores.append(accuracy) #append the accuracy to the list of scores
	return scores

### Create the trees by using different features and figuring out where to split the data at each point

Here the algorithms is running the same function many times to understand where is the best point to divide the  best split point for the dataset. In order to assess it, it uses a coefficient called the Gini Coefficient.
There are three functions:
- the function for dividing the data into two based on a feature
- the function to assess whether this division is resulting in an equal divide and that return a coefficient
- the function to decide between all the different dividing point, which one is the best, which one result in the best gini coefficient

In [33]:
# Split a dataset based on a feature and a feature value defined in build tree
# just trying many times, benefitting from speed of computer
def test_split(index, value, dataset):
	left, right = list(), list()
	for row in dataset:
		# compares set value to all values in that column, if it is smaller, it goes to the left
		# he goes for each value through all dataset again
		if row[index] < value:
			left.append(row)
		# comparing the set value to itself, then it goes to the right
		else:
			right.append(row)
	return left, right

In [34]:

# Calculate the Gini index for a split dataset, using left/right og test split as groups
# cfr calculating wealth distribution: https://en.wikipedia.org/wiki/Gini_coefficient
def gini_index(groups, classes):
	# count all samples at split point (the dataset), converts it in a float in order to do divisions
	n_instances = float(sum([len(group) for group in groups]))
	# sum weighted Gini index for each group
	gini = 0.0
	for group in groups:
		size = float(len(group))
		# avoid divide by zero
		if size == 0:
			continue
		score = 0.0
		# score the group based on the score for each class
		# count number of instances for current class in the group and divide by total size of the group
		for class_val in classes:
			# outcome lies always between 0 and 1
			# for each row it takes the class value and counts how many times the set class value appears, divided by size of the group
			p = [row[-1] for row in group].count(class_val) / size
			# multiply makes it exponentially smaller; you amplify the badness of the score
			score += p * p
		# weight the group score by its relative size (size of group divided by total size of dataset)
		gini += (1.0 - score) * (size / n_instances)
	return gini


# Select the best split point for a dataset
def get_split(dataset, n_features):
	# takes last element of each row (class) and returns it as a row, as it is a set, it has only 2 values
	class_values = list(set(row[-1] for row in dataset))
	# assigning values to variables
	b_index, b_value, b_score, b_groups = 999, 999, 999, None
	# creates list called features
	features = list()
	# as long as features list is not as long as square root of total dataset
	while len(features) < n_features:
		# creates number between 0 and nr of colums (- class)
		index = randrange(len(dataset[0])-1)
		# add column value if not present yet in features, creates only the index with name of the column
		if index not in features:
			features.append(index)
	# for each column name in list features:
	for index in features:
		for row in dataset:
			# take split point, loops through all the points, selecting 1 feature
			groups = test_split(index, row[index], dataset)
			# calculates how 'pure' the split is, whether it gives 2 groups that correspond to 2 classes
			gini = gini_index(groups, class_values)
			# the lower the score is, the better / keep the best score
			# you keep reference to current best option
			if gini < b_score:
				b_index, b_value, b_score, b_groups = index, row[index], gini, groups
	# returns a dictionary
	return {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score}
 

### Create the child splits or terminal nodes

here we define a function that will build the tree, decide wether the new data point is going to go left or right or to build a new node.

In [35]:

# Create a terminal node value = node at end of the tree = end leaf with its predicted class
def to_terminal(group):
	# returns list of classes of group
	outcomes = [row[-1] for row in group]
	# selects most popular class; list of outcomes is reduced to 0 or 1; key counts the amount of times 0 or 1 occurs
	# selects class based on calculating how many times the class occurs
	return max(set(outcomes), key=outcomes.count)


# Counts the amount of unique values in a 'group' (rows in dataset)
def count_unique_values (group):
  # Pick classes in the dataset, transform to a set
  # count amount of values
    return len(set([row[-1] for row in group]))

In [36]:
# Create child splits for a node or make terminals/end leafs
# recursive function, it calls itself
# node is dictionary returned by get_split (b_index, b_value, b_groups)
def split(node, max_depth, min_size, n_features, depth):
	left, right = node['groups']
	del(node['groups'])
	# check for a no split
	# if one of the groups is empty: left and right are both becoming a number with most popular class of the list
	# decision is prediction whether sample is class 0 or 1
	if not left or not right:
		node['left'] = node['right'] = to_terminal(left + right)
		return
	# check for max depth / how many levels of nodes you want to have
	if depth >= max_depth:
		node['left'], node['right'] = to_terminal(left), to_terminal(right)
		return
    
    
	# process left child
	# if length of left group is smaller or equal to 1
	if len(left) <= min_size:
		# it creates an end leaf
		node['left'] = to_terminal(left)
	else:
    # Test here whether the group has only one class
    # if so it can be a terminal
		# it tries again to find best split for the reduced group that is at left of the tree at that moment
		node['left'] = get_split(left, n_features)
		split(node['left'], max_depth, min_size, n_features, depth+1)
	
  
    # process right child
	if len(right) <= min_size:
		node['right'] = to_terminal(right)
	elif count_unique_values(right) == 1:
    # Test here whether the group has only one class
    # if so it can be a terminal
		node['right'] = right[0][-1]
	else:
		node['right'] = get_split(right, n_features)
		# it tries again to find best split for the reduced group that is at right of the tree at that moment
		split(node['right'], max_depth, min_size, n_features, depth+1)
	# return no value because functions are working on the same dictionaries
 

### Create the decision tree

In [37]:
def build_tree(train, max_depth, min_size, n_features):
	# root of decision tree is defined by dictionary of index, value, 2 groups (left/right of the split)
	root = get_split(train, n_features)
	split(root, max_depth, min_size, n_features, 1)
	#root is a Node
	# Node: {index: int, value: float, gini: float, left: Node|TerminalNode, right: Node|TerminalNode}
	# TerminalNode: {index: int, value: float, gini: float, left: int(class), right: int (class)}
	return root

### Run the prediction for a tree

In [38]:
# Make a prediction with a decision tree
# recursive function as well
def predict(node, row):
	# node index = column feature, it looks up value for this feature for this row in dataset
	# compare feature value of row you're checking with feature value of node
	if row[node['index']] < node['value']:
		# is it node? 
		if isinstance(node['left'], dict):
			# recursive function at the left
			return predict(node['left'], row)
		else:
			# creates final leaf at the left
			return node['left']
	else:
		# is it node?
		if isinstance(node['right'], dict):
			# recursive function at the right
			return predict(node['right'], row)
		else:
			# creates final leaf at the left
			return node['right']

### Bootstrapping: doubling data to fill-in the random forest

In [39]:
# Create a random subsample from the dataset with replacement, ratio is called sample_size further on
# This is called BOOTSTRAPPING: build new datasets from the original data, with the same number of rows
# with replacement: after selecting the row we put it back into the data, so it can be selected twice or more
def subsample(dataset, ratio):
	sample = list()
	# if it is smaller than 1, not all dataset is taken as sample - he uses the full dataset
	n_sample = round(len(dataset) * ratio)
	while len(sample) < n_sample:
		index = randrange(len(dataset))
		sample.append(dataset[index])
	return sample
 
 

### Aggregate the prediction of several trees - the council of trees - see what is their verdict

In [40]:
# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
	# asks the forest to predict class for every row in the test data, this gives list of votes
	predictions = [predict(tree, row) for tree in trees]
	# it calculates amount of votes for each class, returns most popular class as prediction
	return max(set(predictions), key=predictions.count)

### The random forest algorithm, encompassing the one above

In [41]:

# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
	trees = list() #create a list of trees
	for i in range(n_trees): 
		sample = subsample(train, sample_size) # create subsamples 
		tree = build_tree(sample, max_depth, min_size, n_features) #build the tree
		trees.append(tree)#append the list       
	with open("model.json", "w") as out_file:  
		json.dump(trees, out_file, indent = 6)
	predictions = [bagging_predict(trees, row) for row in test] # testing with test data, running the prediction on every row. THE FOREST VOTES ON EVERY ROW
	return(predictions) # we return the predicted class of each of the rows as a list. THE PREDICTIONS OF THE FOREST
 

# Running the random forest with our data

In [42]:
from random import random

In [43]:
# Test the random forest algorithm
seed(2) #put the random generator in a certain state -> makes it deterministic otherwise python takes the time of the computer
#print(random())
#print(random())
#seed(2)
#print(random())



In [44]:
# load and prepare data
filename = 'iris_data.csv'
#filename = 'iris.csv'

dataset = load_csv(filename)
#print(dataset)


In [45]:
# convert string attributes to integers
for i in range(0, len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
#str_column_to_int(dataset, len(dataset[0])-1) #this function loops through the rows and transforms the words mine and roch into 1 and 0
#print(dataset)

In [46]:
# evaluate algorithm
n_folds = 5 #the data is randomly sampled into 5 subsamples, one is kept for testing the 4 else are used for training.
max_depth = 10 #
min_size = 1
sample_size = 1.0 #fixed, can be changed to reduces the amount of subsampling, if it is smaller than one.


In [47]:
n_features = int(sqrt(len(dataset[0])-1)) #it specifies the size of the feature subset for the folds, where the size is close to the square root of the total number of features
for n_trees in [1, 5, 10]: # will loop three times by taking n_trees equals one, n_trees equals five and n_trees equals ten successively.
	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
	print('Trees: %d' % n_trees)
	print('Scores: %s' % scores)
	print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

fold size:
30
Trees: 1
Scores: [100.0, 93.33333333333333, 96.66666666666667, 100.0, 90.0]
Mean Accuracy: 96.000%
fold size:
30
Trees: 5
Scores: [96.66666666666667, 93.33333333333333, 90.0, 100.0, 96.66666666666667]
Mean Accuracy: 95.333%
fold size:
30
Trees: 10
Scores: [96.66666666666667, 90.0, 93.33333333333333, 96.66666666666667, 96.66666666666667]
Mean Accuracy: 94.667%


### checking the model with a specific instance

In [50]:
with open ("model.json", "r") as in_file:
	trees=json.load(in_file)
	prediction=bagging_predict(trees,dataset[55])
	print(prediction)

Iris Versicolour
