Random-forest/commenting_code_model/random_forest_model_altered.py

# Using this tutorial : From scratch: https://machinelearningmastery.com/implement-random-forest-scratch-python/
# Explanation on decision trees & random forest: https://pad.constantvzw.org/p/anais_berck_frart_Meise_random_forest

# Random Forest Algorithm on Sonar Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt
import json
import os.path

#from birdseye import eye

# Get the directory of the current script to use in importing data
# and exporting the model.
basepath = os.path.dirname(os.path.realpath(__file__))

# Load a CSV file. Definition of the function to read the csv and create dataset here
#@eye
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset
 
# Convert string column to float - original dataset is in string format
#@eye
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())
 
# Convert string column to integer / transforms classes 'mine' and 'rock' into 1 and 2
#@eye
def str_column_to_int(dataset, column):
	# extracts values of the classes of the dataset: array of all the mine, rock
	class_values = [row[column] for row in dataset]
	# it transforms array into a set of 2 strings, mine and rock
	unique = set(class_values)
	# create dictionary
	lookup = dict()
	# loops through the set and creates dictionary with key (value: rock or mine) and the value is a number
	# destruct tuple by using enumerate with 2 variables i, value - useful if you want to get indexes from a list
	# loops through the set / enumerate gives you a tuple with an index number and a value /common way to get indexes from a list
	for i, value in enumerate(unique):
		lookup[value] = i # the key of the dictonnary is the value: mine or rock; and the value is a number: 0 or 1
	# loops through rows of dataset, replaces the name of class by number/index value
	for row in dataset:
		row[column] = lookup[row[column]]
	# code returns lookup table
	return lookup
 
# Split a dataset into k folds
#@eye
def cross_validation_split(dataset, n_folds):
	# creates list
	dataset_split = list()
	# copies the dataset (which is a list) using the list function
	dataset_copy = list(dataset)
	# size of the fold = length of dataset / amount of folds (ex. 5)
	fold_size = int(len(dataset) / n_folds)
	# loops through list of numbers from 0 up to n_fold
	for i in range(n_folds):
		fold = list()
		# as long as length of list is inferior to defined fold size
		while len(fold) < fold_size:
			# generates random integer between 0 and total length of dataset and store it in index
			index = randrange(len(dataset_copy))
			# pop removes element from list and returns the value, adds value to fold
			fold.append(dataset_copy.pop(index))
		# adds fold to list of folds, each fold has different observations
		dataset_split.append(fold)
	return dataset_split #return the dataset_split, a list of folds
 
# Calculate accuracy percentage
#@eye
def accuracy_metric(actual, predicted):
	correct = 0
	# loops through index list which has length of actual classes
	for i in range(len(actual)):
		# compares values on same positions in list of actual classes with list of predicted classes
		if actual[i] == predicted[i]:
			# adds one to correct variable to count number of correct guesses
			correct += 1
	# gives percentage by dividing correct guesses by length of actual classes divided by 100
	return correct / float(len(actual)) * 100.0
 
# Evaluate an algorithm using a cross validation split
#@eye
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
	# split dataset in n folds
	folds = cross_validation_split(dataset, n_folds)
	# creates list called scores
	scores = list()
	for fold in folds:
		# creates copy of dataset_split, list of lists
		train_set = list(folds)
		# removes 1 fold for testing
		train_set.remove(fold)
		# concatenates all lists of remaining folds into 1 list
		train_set = sum(train_set, [])
		# creates a list called test_set
		test_set = list()
		# iterates through test data
		for row in fold:
			# makes copy of the row
			row_copy = list(row)
			# appends copy to a new list called test set
			test_set.append(row_copy)
			# changes last element in the row, which is the class, to remove the number and set it to None
			row_copy[-1] = None
		# he specifies randomforest as an algorithm, which is also a function; this line is 
		# generating the model based on trainset, makes prediction of classes for testset
		predicted = algorithm(train_set, test_set, *args)
		# list comprehension: gives list of actual classes in fold 
		actual = [row[-1] for row in fold]
		# compares actual classes to predicted classes to give idea of accuracy of function
		accuracy = accuracy_metric(actual, predicted)
		# creates a list of scores based on comparison
		scores.append(accuracy)
	return scores
 
# --------


# Split a dataset based on a feature and a feature value defined in build tree
# just trying many times, benefitting from speed of computer
#@eye
def test_split(index, value, dataset):
	left, right = list(), list()
	for row in dataset:
		# compares set value to all values in that column, if it is smaller, it goes to the left
		# he goes for each value through all dataset again
		if row[index] < value:
			left.append(row)
		# comparing the set value to itself, then it goes to the right
		else:
			right.append(row)
	return left, right
 
# Calculate the Gini index for a split dataset, using left/right og test split as groups
# cfr calculating wealth distribution: https://en.wikipedia.org/wiki/Gini_coefficient
#@eye
def gini_index(groups, classes):
	# count all samples at split point (the dataset), converts it in a float in order to do divisions
	n_instances = float(sum([len(group) for group in groups]))
	# sum weighted Gini index for each group
	gini = 0.0
	for group in groups:
		size = float(len(group))
		# avoid divide by zero
		if size == 0:
			continue
		score = 0.0
		# score the group based on the score for each class
		# count number of instances for current class in the group and divide by total size of the group
		for class_val in classes:
			# outcome lies always between 0 and 1
			# for each row it takes the class value and counts how many times the set class value appears, divided by size of the group
			p = [row[-1] for row in group].count(class_val) / size
			# multiply makes it exponentially smaller; you amplify the badness of the score
			score += p * p
		# weight the group score by its relative size (size of group divided by total size of dataset)
		gini += (1.0 - score) * (size / n_instances)
	return gini
 
# Select the best split point for a dataset
#@eye
def get_split(dataset, n_features):
	# takes last element of each row (class) and returns it as a row, as it is a set, it has only 2 values
	class_values = list(set(row[-1] for row in dataset))
	# assigning values to variables
	b_index, b_value, b_score, b_groups = 999, 999, 999, None
	# creates list called features
	features = list()
	# as long as features list is not as long as square root of total dataset
	while len(features) < n_features:
		# creates number between 0 and nr of colums (- class)
		index = randrange(len(dataset[0])-1)
		# add column value if not present yet in features, creates only the index with name of the column
		if index not in features:
			features.append(index)
	# for each column name in list features:
	for index in features:
		for row in dataset:
			# take split point, loops through all the points, selecting 1 feature
			groups = test_split(index, row[index], dataset)
			# calculates how 'pure' the split is, whether it gives 2 groups that correspond to 2 classes
			gini = gini_index(groups, class_values)
			# the lower the score is, the better / keep the best score
			# you keep reference to current best option
			if gini < b_score:
				b_index, b_value, b_score, b_groups = index, row[index], gini, groups
	# returns a dictionary
	return {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score}
 
# Create a terminal node value = node at end of the tree = end leaf
#@eye
def to_terminal(group):
	# returns list of classes of group
	outcomes = [row[-1] for row in group]
	# selects most popular class; list of outcomes is reduced to 0 or 1; key counts the amount of times 0 or 1 occurs
	# selects class based on calculating how many times the class occurs
	return max(set(outcomes), key=outcomes.count)

# Counts the amount of unique values in a 'group' (rows in dataset)
#@eye
def count_unique_values (group):
  # Pick classes in the dataset, transform to a set
  # count amount of values
  return len(set([row[-1] for row in group]))

# Create child splits for a node or make terminals/end leafs
# recursive function, it calls itself
# node is dictionary returned by get_split (b_index, b_value, b_groups)
#@eye
def split(node, max_depth, min_size, n_features, depth):
	left, right = node['groups']
	del(node['groups'])
	# check for a no split
	# if one of the groups is empty: left and right are both becoming a number with most popular class of the list
	# decision is prediction whether sample is class 0 or 1
	if not left or not right:
		node['left'] = node['right'] = to_terminal(left + right)
		return
	# check for max depth / how many levels of nodes you want to have
	if depth >= max_depth:
		node['left'], node['right'] = to_terminal(left), to_terminal(right)
		return

	# process left child
	# if length of left group is smaller or equal to 1
	if len(left) <= min_size:
		# it creates an end leaf
		node['left'] = to_terminal(left)
	else:
    # Test here whether the group has only one class
    # if so it can be a terminal
		# it tries again to find best split for the reduced group that is at left of the tree at that moment
		node['left'] = get_split(left, n_features)
		split(node['left'], max_depth, min_size, n_features, depth+1)
	
  
  # process right child
	if len(right) <= min_size:
		node['right'] = to_terminal(right)
	elif count_unique_values(right) == 1:
    # Test here whether the group has only one class
    # if so it can be a terminal
		node['right'] = right[0][-1]
	else:
		node['right'] = get_split(right, n_features)
		# it tries again to find best split for the reduced group that is at right of the tree at that moment
		split(node['right'], max_depth, min_size, n_features, depth+1)
	# return no value because functions are working on the same dictionaries
 
# Build a decision tree
#@eye
def build_tree(train, max_depth, min_size, n_features):
	# root of decision tree is defined by dictionary of index, value, 2 groups (left/right of the split)
	root = get_split(train, n_features)
	split(root, max_depth, min_size, n_features, 1)
	#root is a Node
	# Node: {index: int, value: float, gini: float, left: Node|TerminalNode, right: Node|TerminalNode}
	# TerminalNode: {index: int, value: float, gini: float, left: int(class), right: int (class)}
	return root
 
# Make a prediction with a decision tree
# recursive function as well
#@eye
def predict(node, row):
	# node index = column feature, it looks up value for this feature for this row in dataset
	# compare feature value of row you're checking with feature value of node
	if row[node['index']] < node['value']:
		# is it node? 
		if isinstance(node['left'], dict):
			# recursive function at the left
			return predict(node['left'], row)
		else:
			# creates final leaf at the left
			return node['left']
	else:
		# is it node?
		if isinstance(node['right'], dict):
			# recursive function at the right
			return predict(node['right'], row)
		else:
			# creates final leaf at the left
			return node['right']
 
# Create a random subsample from the dataset with replacement, ratio is called sample_size further on
# This is called BOOTSTRAPPING: build new datasets from the original data, with the same number of rows
# with replacement: after selecting the row we put it back into the data, so it can be selected twice or more
#@eye
def subsample(dataset, ratio):
	sample = list()
	# if it is smaller than 1, not all dataset is taken as sample - he uses the full dataset
	n_sample = round(len(dataset) * ratio)
	while len(sample) < n_sample:
		index = randrange(len(dataset))
		sample.append(dataset[index])
	return sample
 
# Make a prediction with a list of bagged trees
#@eye
def bagging_predict(trees, row):
	# asks the forest to predict class for every row in the test data, this gives list of votes
	predictions = [predict(tree, row) for tree in trees]
	# it calculates amount of votes for each class, returns most popular class as prediction
	return max(set(predictions), key=predictions.count)
 
# Random Forest Algorithm
#@eye
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):
	trees = list()
	for _ in range(n_trees):
		sample = subsample(train, sample_size)
		# building a tree / root is dictionary with index, value, left/right)
		tree = build_tree(sample, max_depth, min_size, n_features)
		trees.append(tree)

	if model_path:
		with open(model_path, 'w') as outfile:
			json.dump(trees, outfile, indent = 6)
	# prediction using one of the folds we separated in the beginning, forest votes on every row of test data
	predictions = [bagging_predict(trees, row) for row in test]
	# returns votes/predictions of the forest
	return(predictions)

# --------------------------------------
 
# Test the random forest algorithm
seed(2)
# load and prepare data
# filename = 'sonar_csv.csv'
filename = os.path.join(basepath, 'iris_data.csv')
dataset = load_csv(filename)
#print(dataset)
# convert string attributes to integers
for i in range(0, len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
#lookup = str_column_to_int(dataset, len(dataset[0])-1)
#print(lookup)


# evaluate algorithm
# specifies amounts of subsets for training (-1 for testing)
n_folds = 5
max_depth = 50
min_size = 1
sample_size = 1.0
# it specifies the size of the subset of features for the folds, where the size is close to the square root of the total number of features
n_features = int(sqrt(len(dataset[0])-1))
# it tries forest of 1 tree, 5 trees, 10 trees
  
for n_trees in [1, 5, 10]:
	model_path = os.path.join(basepath, 'random_forest_model_{}-trees.json'.format(n_trees))
	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features, model_path)
	print('Trees: %d' % n_trees)
	print('Scores: %s' % scores)
	print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

#### Note: you also need F1-score & confusion matrix!
#### https://towardsdatascience.com/metrics-to-evaluate-your-machine-learning-algorithm-f10ba6e38234

#### how to use this model now?
#### pickle trees
#### use: unpickle trees + bagging_predict with new data

# with open('random_forest_model.json', 'r') as infile:
# 	trees = json.load(infile)
# 	prediction = bagging_predict(trees, dataset[23])
# 	# this gives a number, you have to reorganise model to get back the string of the class
# 	print(prediction)
Adding the visualizer 2 years ago			`# Using this tutorial : From scratch: https://machinelearningmastery.com/implement-random-forest-scratch-python/`
			`# Explanation on decision trees & random forest: https://pad.constantvzw.org/p/anais_berck_frart_Meise_random_forest`

			`# Random Forest Algorithm on Sonar Dataset`
			`from random import seed`
			`from random import randrange`
			`from csv import reader`
			`from math import sqrt`
			`import json`
Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago			`import os.path`

Updated generated code. 2 years ago			`#from birdseye import eye`

Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago			`# Get the directory of the current script to use in importing data`
			`# and exporting the model.`
			`basepath = os.path.dirname(os.path.realpath(__file__))`

Adding the visualizer 2 years ago			`# Load a CSV file. Definition of the function to read the csv and create dataset here`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def load_csv(filename):`
			`dataset = list()`
			`with open(filename, 'r') as file:`
			`csv_reader = reader(file)`
			`for row in csv_reader:`
			`if not row:`
			`continue`
			`dataset.append(row)`
			`return dataset`

			`# Convert string column to float - original dataset is in string format`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def str_column_to_float(dataset, column):`
			`for row in dataset:`
			`row[column] = float(row[column].strip())`

			`# Convert string column to integer / transforms classes 'mine' and 'rock' into 1 and 2`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def str_column_to_int(dataset, column):`
			`# extracts values of the classes of the dataset: array of all the mine, rock`
			`class_values = [row[column] for row in dataset]`
			`# it transforms array into a set of 2 strings, mine and rock`
			`unique = set(class_values)`
			`# create dictionary`
			`lookup = dict()`
			`# loops through the set and creates dictionary with key (value: rock or mine) and the value is a number`
			`# destruct tuple by using enumerate with 2 variables i, value - useful if you want to get indexes from a list`
			`# loops through the set / enumerate gives you a tuple with an index number and a value /common way to get indexes from a list`
			`for i, value in enumerate(unique):`
			`lookup[value] = i # the key of the dictonnary is the value: mine or rock; and the value is a number: 0 or 1`
			`# loops through rows of dataset, replaces the name of class by number/index value`
			`for row in dataset:`
			`row[column] = lookup[row[column]]`
			`# code returns lookup table`
			`return lookup`

			`# Split a dataset into k folds`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def cross_validation_split(dataset, n_folds):`
			`# creates list`
			`dataset_split = list()`
			`# copies the dataset (which is a list) using the list function`
			`dataset_copy = list(dataset)`
			`# size of the fold = length of dataset / amount of folds (ex. 5)`
			`fold_size = int(len(dataset) / n_folds)`
			`# loops through list of numbers from 0 up to n_fold`
			`for i in range(n_folds):`
			`fold = list()`
			`# as long as length of list is inferior to defined fold size`
			`while len(fold) < fold_size:`
			`# generates random integer between 0 and total length of dataset and store it in index`
			`index = randrange(len(dataset_copy))`
			`# pop removes element from list and returns the value, adds value to fold`
			`fold.append(dataset_copy.pop(index))`
			`# adds fold to list of folds, each fold has different observations`
			`dataset_split.append(fold)`
			`return dataset_split #return the dataset_split, a list of folds`

			`# Calculate accuracy percentage`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def accuracy_metric(actual, predicted):`
			`correct = 0`
			`# loops through index list which has length of actual classes`
			`for i in range(len(actual)):`
			`# compares values on same positions in list of actual classes with list of predicted classes`
			`if actual[i] == predicted[i]:`
			`# adds one to correct variable to count number of correct guesses`
			`correct += 1`
			`# gives percentage by dividing correct guesses by length of actual classes divided by 100`
			`return correct / float(len(actual)) * 100.0`

			`# Evaluate an algorithm using a cross validation split`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def evaluate_algorithm(dataset, algorithm, n_folds, *args):`
			`# split dataset in n folds`
			`folds = cross_validation_split(dataset, n_folds)`
			`# creates list called scores`
			`scores = list()`
			`for fold in folds:`
			`# creates copy of dataset_split, list of lists`
			`train_set = list(folds)`
			`# removes 1 fold for testing`
			`train_set.remove(fold)`
			`# concatenates all lists of remaining folds into 1 list`
			`train_set = sum(train_set, [])`
			`# creates a list called test_set`
			`test_set = list()`
			`# iterates through test data`
			`for row in fold:`
			`# makes copy of the row`
			`row_copy = list(row)`
			`# appends copy to a new list called test set`
			`test_set.append(row_copy)`
			`# changes last element in the row, which is the class, to remove the number and set it to None`
			`row_copy[-1] = None`
			`# he specifies randomforest as an algorithm, which is also a function; this line is`
			`# generating the model based on trainset, makes prediction of classes for testset`
			`predicted = algorithm(train_set, test_set, *args)`
			`# list comprehension: gives list of actual classes in fold`
			`actual = [row[-1] for row in fold]`
			`# compares actual classes to predicted classes to give idea of accuracy of function`
			`accuracy = accuracy_metric(actual, predicted)`
			`# creates a list of scores based on comparison`
			`scores.append(accuracy)`
			`return scores`

			`# --------`


			`# Split a dataset based on a feature and a feature value defined in build tree`
			`# just trying many times, benefitting from speed of computer`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def test_split(index, value, dataset):`
			`left, right = list(), list()`
			`for row in dataset:`
			`# compares set value to all values in that column, if it is smaller, it goes to the left`
			`# he goes for each value through all dataset again`
			`if row[index] < value:`
			`left.append(row)`
			`# comparing the set value to itself, then it goes to the right`
			`else:`
			`right.append(row)`
			`return left, right`

			`# Calculate the Gini index for a split dataset, using left/right og test split as groups`
			`# cfr calculating wealth distribution: https://en.wikipedia.org/wiki/Gini_coefficient`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def gini_index(groups, classes):`
			`# count all samples at split point (the dataset), converts it in a float in order to do divisions`
			`n_instances = float(sum([len(group) for group in groups]))`
			`# sum weighted Gini index for each group`
			`gini = 0.0`
			`for group in groups:`
			`size = float(len(group))`
			`# avoid divide by zero`
			`if size == 0:`
			`continue`
			`score = 0.0`
			`# score the group based on the score for each class`
			`# count number of instances for current class in the group and divide by total size of the group`
			`for class_val in classes:`
			`# outcome lies always between 0 and 1`
			`# for each row it takes the class value and counts how many times the set class value appears, divided by size of the group`
			`p = [row[-1] for row in group].count(class_val) / size`
			`# multiply makes it exponentially smaller; you amplify the badness of the score`
			`score += p * p`
			`# weight the group score by its relative size (size of group divided by total size of dataset)`
			`gini += (1.0 - score) * (size / n_instances)`
			`return gini`

			`# Select the best split point for a dataset`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def get_split(dataset, n_features):`
			`# takes last element of each row (class) and returns it as a row, as it is a set, it has only 2 values`
			`class_values = list(set(row[-1] for row in dataset))`
			`# assigning values to variables`
			`b_index, b_value, b_score, b_groups = 999, 999, 999, None`
			`# creates list called features`
			`features = list()`
			`# as long as features list is not as long as square root of total dataset`
			`while len(features) < n_features:`
			`# creates number between 0 and nr of colums (- class)`
			`index = randrange(len(dataset[0])-1)`
			`# add column value if not present yet in features, creates only the index with name of the column`
			`if index not in features:`
			`features.append(index)`
			`# for each column name in list features:`
			`for index in features:`
			`for row in dataset:`
			`# take split point, loops through all the points, selecting 1 feature`
			`groups = test_split(index, row[index], dataset)`
			`# calculates how 'pure' the split is, whether it gives 2 groups that correspond to 2 classes`
			`gini = gini_index(groups, class_values)`
			`# the lower the score is, the better / keep the best score`
			`# you keep reference to current best option`
			`if gini < b_score:`
			`b_index, b_value, b_score, b_groups = index, row[index], gini, groups`
			`# returns a dictionary`
			`return {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score}`

			`# Create a terminal node value = node at end of the tree = end leaf`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def to_terminal(group):`
			`# returns list of classes of group`
			`outcomes = [row[-1] for row in group]`
			`# selects most popular class; list of outcomes is reduced to 0 or 1; key counts the amount of times 0 or 1 occurs`
			`# selects class based on calculating how many times the class occurs`
			`return max(set(outcomes), key=outcomes.count)`

			`# Counts the amount of unique values in a 'group' (rows in dataset)`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def count_unique_values (group):`
			`# Pick classes in the dataset, transform to a set`
			`# count amount of values`
			`return len(set([row[-1] for row in group]))`

			`# Create child splits for a node or make terminals/end leafs`
			`# recursive function, it calls itself`
			`# node is dictionary returned by get_split (b_index, b_value, b_groups)`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def split(node, max_depth, min_size, n_features, depth):`
			`left, right = node['groups']`
			`del(node['groups'])`
			`# check for a no split`
			`# if one of the groups is empty: left and right are both becoming a number with most popular class of the list`
			`# decision is prediction whether sample is class 0 or 1`
			`if not left or not right:`
			`node['left'] = node['right'] = to_terminal(left + right)`
			`return`
			`# check for max depth / how many levels of nodes you want to have`
			`if depth >= max_depth:`
			`node['left'], node['right'] = to_terminal(left), to_terminal(right)`
			`return`

			`# process left child`
			`# if length of left group is smaller or equal to 1`
			`if len(left) <= min_size:`
			`# it creates an end leaf`
			`node['left'] = to_terminal(left)`
			`else:`
			`# Test here whether the group has only one class`
			`# if so it can be a terminal`
			`# it tries again to find best split for the reduced group that is at left of the tree at that moment`
			`node['left'] = get_split(left, n_features)`
			`split(node['left'], max_depth, min_size, n_features, depth+1)`


			`# process right child`
			`if len(right) <= min_size:`
			`node['right'] = to_terminal(right)`
			`elif count_unique_values(right) == 1:`
			`# Test here whether the group has only one class`
			`# if so it can be a terminal`
			`node['right'] = right[0][-1]`
			`else:`
			`node['right'] = get_split(right, n_features)`
			`# it tries again to find best split for the reduced group that is at right of the tree at that moment`
			`split(node['right'], max_depth, min_size, n_features, depth+1)`
			`# return no value because functions are working on the same dictionaries`

			`# Build a decision tree`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def build_tree(train, max_depth, min_size, n_features):`
			`# root of decision tree is defined by dictionary of index, value, 2 groups (left/right of the split)`
			`root = get_split(train, n_features)`
			`split(root, max_depth, min_size, n_features, 1)`
			`#root is a Node`
			`# Node: {index: int, value: float, gini: float, left: Node\|TerminalNode, right: Node\|TerminalNode}`
			`# TerminalNode: {index: int, value: float, gini: float, left: int(class), right: int (class)}`
			`return root`

			`# Make a prediction with a decision tree`
			`# recursive function as well`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def predict(node, row):`
			`# node index = column feature, it looks up value for this feature for this row in dataset`
			`# compare feature value of row you're checking with feature value of node`
			`if row[node['index']] < node['value']:`
			`# is it node?`
			`if isinstance(node['left'], dict):`
			`# recursive function at the left`
			`return predict(node['left'], row)`
			`else:`
			`# creates final leaf at the left`
			`return node['left']`
			`else:`
			`# is it node?`
			`if isinstance(node['right'], dict):`
			`# recursive function at the right`
			`return predict(node['right'], row)`
			`else:`
			`# creates final leaf at the left`
			`return node['right']`

			`# Create a random subsample from the dataset with replacement, ratio is called sample_size further on`
			`# This is called BOOTSTRAPPING: build new datasets from the original data, with the same number of rows`
			`# with replacement: after selecting the row we put it back into the data, so it can be selected twice or more`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def subsample(dataset, ratio):`
			`sample = list()`
			`# if it is smaller than 1, not all dataset is taken as sample - he uses the full dataset`
			`n_sample = round(len(dataset) * ratio)`
			`while len(sample) < n_sample:`
			`index = randrange(len(dataset))`
			`sample.append(dataset[index])`
			`return sample`

			`# Make a prediction with a list of bagged trees`
Updated generated code. 2 years ago			`#@eye`
Adding the visualizer 2 years ago			`def bagging_predict(trees, row):`
			`# asks the forest to predict class for every row in the test data, this gives list of votes`
			`predictions = [predict(tree, row) for tree in trees]`
			`# it calculates amount of votes for each class, returns most popular class as prediction`
			`return max(set(predictions), key=predictions.count)`

			`# Random Forest Algorithm`
Updated generated code. 2 years ago			`#@eye`
Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago			`def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):`
Adding the visualizer 2 years ago			`trees = list()`
Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago			`for _ in range(n_trees):`
Adding the visualizer 2 years ago			`sample = subsample(train, sample_size)`
			`# building a tree / root is dictionary with index, value, left/right)`
			`tree = build_tree(sample, max_depth, min_size, n_features)`
			`trees.append(tree)`

Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago			`if model_path:`
			`with open(model_path, 'w') as outfile:`
			`json.dump(trees, outfile, indent = 6)`
Adding the visualizer 2 years ago			`# prediction using one of the folds we separated in the beginning, forest votes on every row of test data`
			`predictions = [bagging_predict(trees, row) for row in test]`
			`# returns votes/predictions of the forest`
			`return(predictions)`

			`# --------------------------------------`

			`# Test the random forest algorithm`
			`seed(2)`
			`# load and prepare data`
			`# filename = 'sonar_csv.csv'`
Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago			`filename = os.path.join(basepath, 'iris_data.csv')`
Adding the visualizer 2 years ago			`dataset = load_csv(filename)`
			`#print(dataset)`
			`# convert string attributes to integers`
			`for i in range(0, len(dataset[0])-1):`
			`str_column_to_float(dataset, i)`
			`# convert class column to integers`
			`#lookup = str_column_to_int(dataset, len(dataset[0])-1)`
			`#print(lookup)`


			`# evaluate algorithm`
			`# specifies amounts of subsets for training (-1 for testing)`
			`n_folds = 5`
			`max_depth = 50`
			`min_size = 1`
			`sample_size = 1.0`
			`# it specifies the size of the subset of features for the folds, where the size is close to the square root of the total number of features`
			`n_features = int(sqrt(len(dataset[0])-1))`
			`# it tries forest of 1 tree, 5 trees, 10 trees`
Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago
Adding the visualizer 2 years ago			`for n_trees in [1, 5, 10]:`
Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago			`model_path = os.path.join(basepath, 'random_forest_model_{}-trees.json'.format(n_trees))`
			`scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features, model_path)`
Adding the visualizer 2 years ago			`print('Trees: %d' % n_trees)`
			`print('Scores: %s' % scores)`
			`print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))`

			`#### Note: you also need F1-score & confusion matrix!`
			`#### https://towardsdatascience.com/metrics-to-evaluate-your-machine-learning-algorithm-f10ba6e38234`

			`#### how to use this model now?`
			`#### pickle trees`
			`#### use: unpickle trees + bagging_predict with new data`

Updated readme. Altered scripts to use absolute paths so they can be ran from everywhere and still place exports, or load imports from relative paths. 2 years ago			`# with open('random_forest_model.json', 'r') as infile:`
			`# trees = json.load(infile)`
			`# prediction = bagging_predict(trees, dataset[23])`
			`# # this gives a number, you have to reorganise model to get back the string of the class`
			`# print(prediction)`