{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Random Forest"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setting the space\n",
"### importing Librairies"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from random import seed\n",
"from random import randrange\n",
"from csv import reader\n",
"from math import sqrt\n",
"import json\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load a CSV file. \n",
"\n",
"Definition of the function to read the csv and create dataset"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def load_csv(filename):\n",
"\tdataset = list()\n",
"\twith open(filename, 'r') as file:\n",
"\t\tcsv_reader = reader(file)\n",
"\t\tfor row in csv_reader:\n",
"\t\t\tif not row:\n",
"\t\t\t\tcontinue\n",
"\t\t\tdataset.append(row)\n",
"\treturn dataset\n"
]
},
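{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch, assuming a small file `toy.csv` that we write on the fly just for illustration; note that every value comes back as a string:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical usage sketch: write a tiny CSV and read it back with load_csv\n",
"with open('toy.csv', 'w') as f:\n",
"\tf.write('5.1,3.5,1.4,0.2,Iris-setosa\\n')\n",
"\tf.write('7.0,3.2,4.7,1.4,Iris-versicolor\\n')\n",
"toy = load_csv('toy.csv')\n",
"print(toy) # [['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], ...] - all values are still strings"
]
},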
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preparing the data\n",
"### Conversion of certain data\n",
"extract the values of the column (here types of Iris)\n",
"calculate how many unique class values there are and store them into a set: a list with unique values\n",
"Tranform class values into numbers/integers"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Convert string column to float\n",
"def str_column_to_float(dataset, column):\n",
"\tfor row in dataset:\n",
"\t\trow[column] = float(row[column].strip())\n",
"\n",
"# Convert string column to integer\n",
"def str_column_to_int(dataset, column):\n",
"\tclass_values = [row[column] for row in dataset] # extract the values of the column (here the classes of the dataset, mine and rocks)\n",
"\tunique = set(class_values) # calculate how many unique class values there are and store them into a set: a list with unique values\n",
"\tlookup = dict() #create a dictionnary\n",
"\tfor i, value in enumerate(unique): # loops through the set / enumerate gives you a tuple with an index number and a value /common way to get indexes from a list\n",
"\t\tlookup[value] = i # the key of the dictonnary is the value: mine or rock/or types of iris; and the value is a number: 0 or 1 (or 2)\n",
"\tfor row in dataset: # loops through the rows of the dataset\n",
"\t\trow[column] = lookup[row[column]] #replaces the value of the column: rock or mine/or types of iris, with the index value: 0 or (1 or 2);\n",
"\treturn lookup # the code returns the lookup table\n"
]
},
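{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sketch of the two converters on an invented toy dataset; note that the exact numbers in the lookup depend on the iteration order of the set:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical toy data: two feature columns as strings plus a class label\n",
"toy = [['1.0', '2.0', 'rock'], ['3.0', '4.0', 'mine'], ['5.0', '6.0', 'rock']]\n",
"for i in range(2):\n",
"\tstr_column_to_float(toy, i)\n",
"lookup = str_column_to_int(toy, 2)\n",
"print(lookup) # e.g. {'rock': 0, 'mine': 1}\n",
"print(toy) # e.g. [[1.0, 2.0, 0], [3.0, 4.0, 1], [5.0, 6.0, 0]]"
]
},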
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Function to create a list of folds (divide the dataset into smaller subsets)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Split a dataset into k folds\n",
"def cross_validation_split(dataset, n_folds):\n",
"\tdataset_split = list() #create a list \n",
"\tdataset_copy = list(dataset) # creates a list of the dataset. You could use the copy library\n",
"\tfold_size = int(len(dataset) / n_folds) # the size of the fold is equal to the length of the dataset divided by the amount of folds\n",
"\tprint(\"fold size:\")\n",
"\tprint(fold_size) \n",
"\tfor i in range(n_folds): #loops the amount of folds : generate another list with numbers from 0 up to n_fold\n",
"\t\tfold = list() #create a list\n",
"\t\twhile len(fold) < fold_size: # as long as the length of the list is inferior to the defined fold size\n",
"\t\t\tindex = randrange(len(dataset_copy)) # return a random integer between 0 and the total length of the dataset and store it in index\n",
"\t\t\tfold.append(dataset_copy.pop(index)) # append an observation at the index and removes it from the dataset\n",
"\t\tdataset_split.append(fold) # append the fold to the list dataset_split\n",
"\t\t#print(\"______________\")\n",
"\t\t#print(\"dataset split:\")\n",
"\t\t#print(dataset_split)\n",
"\t\t#print(\"______________\")\n",
"\treturn dataset_split #return the dataset_split, a list of folds\n"
]
},
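{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sketch with ten invented one-column rows, to make the fold shapes visible; the draw is random, so the folds differ between runs:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical: ten one-column rows split into 5 folds of 2 rows each\n",
"rows = [[i] for i in range(10)]\n",
"for fold in cross_validation_split(rows, 5):\n",
"\tprint(fold) # e.g. [[3], [8]]"
]
},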
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Functions definitions\n",
"### Calculate accuracy in the prediction"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Calculate accuracy percentage\n",
"def accuracy_metric(actual, predicted):\n",
"\tcorrect = 0 #create a correct variable\n",
"\tfor i in range(len(actual)): # loops up to the length of the actual list\n",
"\t\tif actual[i] == predicted[i]: # compares the actual vs the predicted\n",
"\t\t\tcorrect += 1 #if correct add one to the correct variable. Count the number of correct guesses\n",
"\treturn correct / float(len(actual)) * 100.0 #gives a percentage by dividing the correct guesses by the length of the actual classes and multiply by a hundred\n",
" "
]
},
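{
"cell_type": "markdown",
"metadata": {},
"source": [
"For instance, with five actual labels of which four are predicted correctly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 4 out of 5 predictions match, so the accuracy is 80.0\n",
"print(accuracy_metric([0, 1, 1, 0, 1], [0, 1, 1, 0, 0])) # 80.0"
]
},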
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a list of score for each algorithm/tree"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate an algorithm using a cross validation split\n",
"def evaluate_algorithm(dataset, algorithm, n_folds, *args):\n",
"\tfolds = cross_validation_split(dataset, n_folds) #return the list of folds\n",
"\tscores = list() #creates a list called scores\n",
"\tfor fold in folds: #loops through the folds\n",
"\t\ttrain_set = list(folds) #creates a copy of the list of folds \n",
"\t\ttrain_set.remove(fold) #remove one fold: for testing?\n",
"\t\ttrain_set = sum(train_set, []) # concatenate all folds, a list of lists, into one list. can be done with itertools.chain\n",
"\t\ttest_set = list() # create another list\n",
"\t\tfor row in fold: # iterates through fold\n",
"\t\t\trow_copy = list(row) # creates a copy of the list\n",
"\t\t\ttest_set.append(row_copy) # append the list to the test_set\n",
"\t\t\trow_copy[-1] = None # set the classification to none. Changes the last column to none.\n",
"\t\tpredicted = algorithm(train_set, test_set, *args) # what is doing the prediction takes for argument the train and test set and returns prediction for the test set\n",
"\t\tactual = [row[-1] for row in fold] # list comprehension: list of actual classes from fold.\n",
"\t\taccuracy = accuracy_metric(actual, predicted) # function that compares the actual vs the predicted to give an idea of the accuracy of the prediction\n",
"\t\tscores.append(accuracy) #append the accuracy to the list of scores\n",
"\treturn scores"
]
},
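{
"cell_type": "markdown",
"metadata": {},
"source": [
"The harness works with any function that has the signature `algorithm(train_set, test_set, *args)`. A minimal sketch, assuming a trivial baseline (invented here) that ignores the features and always predicts the most common class of the training set:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical baseline: always predict the majority class of the training set\n",
"def majority_class(train, test):\n",
"\toutcomes = [row[-1] for row in train]\n",
"\tmajority = max(set(outcomes), key=outcomes.count)\n",
"\treturn [majority for _ in test]\n",
"\n",
"toy = [[float(i), i % 2] for i in range(20)]\n",
"print(evaluate_algorithm(toy, majority_class, 4)) # four accuracy scores around 50%"
]
},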
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the trees by using different features and figuring out where to split the data at each point"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here the algorithms is running the same function many times to understand where is the best point to divide the best split point for the dataset. In order to assess it, it uses a coefficient called the Gini Coefficient.\n",
"There are three functions:\n",
"- the function for dividing the data into two based on a feature\n",
"- the function to assess whether this division is resulting in an equal divide and that return a coefficient\n",
"- the function to decide between all the different dividing point, which one is the best, which one result in the best gini coefficient"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# Split a dataset based on a feature and a feature value defined in build tree\n",
"# just trying many times, benefitting from speed of computer\n",
"def test_split(index, value, dataset):\n",
"\tleft, right = list(), list()\n",
"\tfor row in dataset:\n",
"\t\t# compares set value to all values in that column, if it is smaller, it goes to the left\n",
"\t\t# he goes for each value through all dataset again\n",
"\t\tif row[index] < value:\n",
"\t\t\tleft.append(row)\n",
"\t\t# comparing the set value to itself, then it goes to the right\n",
"\t\telse:\n",
"\t\t\tright.append(row)\n",
"\treturn left, right"
]
},
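{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of a single split on invented one-feature rows; the value 3.0 goes to the right because the comparison is strictly 'smaller than':"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical rows: one feature followed by a class label\n",
"rows = [[1.0, 0], [2.0, 0], [3.0, 1], [4.0, 1]]\n",
"left, right = test_split(0, 3.0, rows)\n",
"print(left) # [[1.0, 0], [2.0, 0]] - feature value below 3.0\n",
"print(right) # [[3.0, 1], [4.0, 1]] - feature value 3.0 or above"
]
},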
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Calculate the Gini index for a split dataset, using left/right og test split as groups\n",
"# cfr calculating wealth distribution: https://en.wikipedia.org/wiki/Gini_coefficient\n",
"def gini_index(groups, classes):\n",
"\t# count all samples at split point (the dataset), converts it in a float in order to do divisions\n",
"\tn_instances = float(sum([len(group) for group in groups]))\n",
"\t# sum weighted Gini index for each group\n",
"\tgini = 0.0\n",
"\tfor group in groups:\n",
"\t\tsize = float(len(group))\n",
"\t\t# avoid divide by zero\n",
"\t\tif size == 0:\n",
"\t\t\tcontinue\n",
"\t\tscore = 0.0\n",
"\t\t# score the group based on the score for each class\n",
"\t\t# count number of instances for current class in the group and divide by total size of the group\n",
"\t\tfor class_val in classes:\n",
"\t\t\t# outcome lies always between 0 and 1\n",
"\t\t\t# for each row it takes the class value and counts how many times the set class value appears, divided by size of the group\n",
"\t\t\tp = [row[-1] for row in group].count(class_val) / size\n",
"\t\t\t# multiply makes it exponentially smaller; you amplify the badness of the score\n",
"\t\t\tscore += p * p\n",
"\t\t# weight the group score by its relative size (size of group divided by total size of dataset)\n",
"\t\tgini += (1.0 - score) * (size / n_instances)\n",
"\treturn gini\n",
"\n",
"\n",
"# Select the best split point for a dataset\n",
"def get_split(dataset, n_features):\n",
"\t# takes last element of each row (class) and returns it as a row, as it is a set, it has only 2 values\n",
"\tclass_values = list(set(row[-1] for row in dataset))\n",
"\t# assigning values to variables\n",
"\tb_index, b_value, b_score, b_groups = 999, 999, 999, None\n",
"\t# creates list called features\n",
"\tfeatures = list()\n",
"\t# as long as features list is not as long as square root of total dataset\n",
"\twhile len(features) < n_features:\n",
"\t\t# creates number between 0 and nr of colums (- class)\n",
"\t\tindex = randrange(len(dataset[0])-1)\n",
"\t\t# add column value if not present yet in features, creates only the index with name of the column\n",
"\t\tif index not in features:\n",
"\t\t\tfeatures.append(index)\n",
"\t# for each column name in list features:\n",
"\tfor index in features:\n",
"\t\tfor row in dataset:\n",
"\t\t\t# take split point, loops through all the points, selecting 1 feature\n",
"\t\t\tgroups = test_split(index, row[index], dataset)\n",
"\t\t\t# calculates how 'pure' the split is, whether it gives 2 groups that correspond to 2 classes\n",
"\t\t\tgini = gini_index(groups, class_values)\n",
"\t\t\t# the lower the score is, the better / keep the best score\n",
"\t\t\t# you keep reference to current best option\n",
"\t\t\tif gini < b_score:\n",
"\t\t\t\tb_index, b_value, b_score, b_groups = index, row[index], gini, groups\n",
"\t# returns a dictionary\n",
"\treturn {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score}\n",
" "
]
},
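{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two sketches on the same invented rows: a perfectly pure split scores 0.0, a fully mixed split scores 0.5, and `get_split` should recover the pure split at value 3.0:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rows = [[1.0, 0], [2.0, 0], [3.0, 1], [4.0, 1]]\n",
"pure = ([[1.0, 0], [2.0, 0]], [[3.0, 1], [4.0, 1]])\n",
"mixed = ([[1.0, 0], [3.0, 1]], [[2.0, 0], [4.0, 1]])\n",
"print(gini_index(pure, [0, 1])) # 0.0 - each group holds a single class\n",
"print(gini_index(mixed, [0, 1])) # 0.5 - both groups are half-and-half\n",
"best = get_split(rows, 1)\n",
"print(best['index'], best['value'], best['gini']) # 0 3.0 0.0"
]
},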
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the child splits or terminal nodes\n",
"\n",
"here we define a function that will build the tree, decide wether the new data point is going to go left or right or to build a new node."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Create a terminal node value = node at end of the tree = end leaf with its predicted class\n",
"def to_terminal(group):\n",
"\t# returns list of classes of group\n",
"\toutcomes = [row[-1] for row in group]\n",
"\t# selects most popular class; list of outcomes is reduced to 0 or 1; key counts the amount of times 0 or 1 occurs\n",
"\t# selects class based on calculating how many times the class occurs\n",
"\treturn max(set(outcomes), key=outcomes.count)\n",
"\n",
"\n",
"# Counts the amount of unique values in a 'group' (rows in dataset)\n",
"def count_unique_values (group):\n",
" # Pick classes in the dataset, transform to a set\n",
" # count amount of values\n",
" return len(set([row[-1] for row in group]))"
]
},
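{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check of both helpers on an invented group:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"group = [[1.0, 0], [2.0, 0], [3.0, 1]]\n",
"print(to_terminal(group)) # 0 - the most frequent class wins\n",
"print(count_unique_values(group)) # 2 - the group still mixes two classes"
]
},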
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# Create child splits for a node or make terminals/end leafs\n",
"# recursive function, it calls itself\n",
"# node is dictionary returned by get_split (b_index, b_value, b_groups)\n",
"def split(node, max_depth, min_size, n_features, depth):\n",
"\tleft, right = node['groups']\n",
"\tdel(node['groups'])\n",
"\t# check for a no split\n",
"\t# if one of the groups is empty: left and right are both becoming a number with most popular class of the list\n",
"\t# decision is prediction whether sample is class 0 or 1\n",
"\tif not left or not right:\n",
"\t\tnode['left'] = node['right'] = to_terminal(left + right)\n",
"\t\treturn\n",
"\t# check for max depth / how many levels of nodes you want to have\n",
"\tif depth >= max_depth:\n",
"\t\tnode['left'], node['right'] = to_terminal(left), to_terminal(right)\n",
"\t\treturn\n",
" \n",
" \n",
"\t# process left child\n",
"\t# if length of left group is smaller or equal to 1\n",
"\tif len(left) <= min_size:\n",
"\t\t# it creates an end leaf\n",
"\t\tnode['left'] = to_terminal(left)\n",
"\telse:\n",
" # Test here whether the group has only one class\n",
" # if so it can be a terminal\n",
"\t\t# it tries again to find best split for the reduced group that is at left of the tree at that moment\n",
"\t\tnode['left'] = get_split(left, n_features)\n",
"\t\tsplit(node['left'], max_depth, min_size, n_features, depth+1)\n",
"\t\n",
" \n",
" # process right child\n",
"\tif len(right) <= min_size:\n",
"\t\tnode['right'] = to_terminal(right)\n",
"\telif count_unique_values(right) == 1:\n",
" # Test here whether the group has only one class\n",
" # if so it can be a terminal\n",
"\t\tnode['right'] = right[0][-1]\n",
"\telse:\n",
"\t\tnode['right'] = get_split(right, n_features)\n",
"\t\t# it tries again to find best split for the reduced group that is at right of the tree at that moment\n",
"\t\tsplit(node['right'], max_depth, min_size, n_features, depth+1)\n",
"\t# return no value because functions are working on the same dictionaries\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the decision tree"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"def build_tree(train, max_depth, min_size, n_features):\n",
"\t# root of decision tree is defined by dictionary of index, value, 2 groups (left/right of the split)\n",
"\troot = get_split(train, n_features)\n",
"\tsplit(root, max_depth, min_size, n_features, 1)\n",
"\t#root is a Node\n",
"\t# Node: {index: int, value: float, gini: float, left: Node|TerminalNode, right: Node|TerminalNode}\n",
"\t# TerminalNode: {index: int, value: float, gini: float, left: int(class), right: int (class)}\n",
"\treturn root"
]
},
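{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch that builds a tiny tree on the invented rows used earlier; with `min_size=2` the pure groups become terminals right away, so one split at 3.0 separates the classes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rows = [[1.0, 0], [2.0, 0], [3.0, 1], [4.0, 1]]\n",
"tree = build_tree(rows, max_depth=3, min_size=2, n_features=1)\n",
"print(json.dumps(tree, indent=2)) # index 0, value 3.0, gini 0.0, left 0, right 1"
]
},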
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run the prediction for a tree"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# Make a prediction with a decision tree\n",
"# recursive function as well\n",
"def predict(node, row):\n",
"\t# node index = column feature, it looks up value for this feature for this row in dataset\n",
"\t# compare feature value of row you're checking with feature value of node\n",
"\tif row[node['index']] < node['value']:\n",
"\t\t# is it node? \n",
"\t\tif isinstance(node['left'], dict):\n",
"\t\t\t# recursive function at the left\n",
"\t\t\treturn predict(node['left'], row)\n",
"\t\telse:\n",
"\t\t\t# creates final leaf at the left\n",
"\t\t\treturn node['left']\n",
"\telse:\n",
"\t\t# is it node?\n",
"\t\tif isinstance(node['right'], dict):\n",
"\t\t\t# recursive function at the right\n",
"\t\t\treturn predict(node['right'], row)\n",
"\t\telse:\n",
"\t\t\t# creates final leaf at the left\n",
"\t\t\treturn node['right']"
]
},
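{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch with a hand-written stump of the same shape as the node dictionaries built above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical stump: feature 0 below 3.0 means class 0, otherwise class 1\n",
"stump = {'index': 0, 'value': 3.0, 'left': 0, 'right': 1}\n",
"print(predict(stump, [2.5])) # 0\n",
"print(predict(stump, [4.2])) # 1"
]
},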
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bootstrapping: doubling data to fill-in the random forest"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# Create a random subsample from the dataset with replacement, ratio is called sample_size further on\n",
"# This is called BOOTSTRAPPING: build new datasets from the original data, with the same number of rows\n",
"# with replacement: after selecting the row we put it back into the data, so it can be selected twice or more\n",
"def subsample(dataset, ratio):\n",
"\tsample = list()\n",
"\t# if it is smaller than 1, not all dataset is taken as sample - he uses the full dataset\n",
"\tn_sample = round(len(dataset) * ratio)\n",
"\twhile len(sample) < n_sample:\n",
"\t\tindex = randrange(len(dataset))\n",
"\t\tsample.append(dataset[index])\n",
"\treturn sample\n",
" \n",
" "
]
},
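{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch that makes the 'with replacement' part visible: the sample has the same length as the data, but some rows can appear twice and others not at all (rows invented):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rows = [[i] for i in range(5)]\n",
"print(subsample(rows, 1.0)) # e.g. [[3], [0], [3], [4], [1]] - duplicates are possible"
]
},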
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Aggregate the prediction of several trees - the council of trees - see what is their verdict"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# Make a prediction with a list of bagged trees\n",
"def bagging_predict(trees, row):\n",
"\t# asks the forest to predict class for every row in the test data, this gives list of votes\n",
"\tpredictions = [predict(tree, row) for tree in trees]\n",
"\t# it calculates amount of votes for each class, returns most popular class as prediction\n",
"\treturn max(set(predictions), key=predictions.count)"
]
},
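{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of the vote with three hand-written stumps; for the row below, two stumps predict class 1 and one predicts class 0, so the council answers 1:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical council of three stumps voting on a single row\n",
"stump_a = {'index': 0, 'value': 3.0, 'left': 0, 'right': 1}\n",
"stump_b = {'index': 0, 'value': 2.0, 'left': 0, 'right': 1}\n",
"stump_c = {'index': 0, 'value': 5.0, 'left': 0, 'right': 1}\n",
"print(bagging_predict([stump_a, stump_b, stump_c], [4.0])) # 1 - two votes against one"
]
},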
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### The random forest algorithm, encompassing the one above"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Random Forest Algorithm\n",
"def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):\n",
"\ttrees = list() #create a list of trees\n",
"\tfor i in range(n_trees): \n",
"\t\tsample = subsample(train, sample_size) # create subsamples \n",
"\t\ttree = build_tree(sample, max_depth, min_size, n_features) #build the tree\n",
"\t\ttrees.append(tree)#append the list \n",
"\twith open(\"model.json\", \"w\") as out_file: \n",
"\t\tjson.dump(trees, out_file, indent = 6)\n",
"\tpredictions = [bagging_predict(trees, row) for row in test] # testing with test data, running the prediction on every row. THE FOREST VOTES ON EVERY ROW\n",
"\treturn(predictions) # we return the predicted class of each of the rows as a list. THE PREDICTIONS OF THE FOREST\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Running the random forest with our data"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"from random import random"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Test the random forest algorithm\n",
"seed(2) #put the random generator in a certain state -> makes it deterministic otherwise python takes the time of the computer\n",
"#print(random())\n",
"#print(random())\n",
"#seed(2)\n",
"#print(random())\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# load and prepare data\n",
"filename = 'iris_data.csv'\n",
"#filename = 'iris.csv'\n",
"\n",
"dataset = load_csv(filename)\n",
"#print(dataset)\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# convert string attributes to integers\n",
"for i in range(0, len(dataset[0])-1):\n",
"\tstr_column_to_float(dataset, i)\n",
"# convert class column to integers\n",
"#str_column_to_int(dataset, len(dataset[0])-1) #this function loops through the rows and transforms the words mine and roch into 1 and 0\n",
"#print(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# evaluate algorithm\n",
"n_folds = 5 #the data is randomly sampled into 5 subsamples, one is kept for testing the 4 else are used for training.\n",
"max_depth = 10 #\n",
"min_size = 1\n",
"sample_size = 1.0 #fixed, can be changed to reduces the amount of subsampling, if it is smaller than one.\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fold size:\n",
"30\n",
"Trees: 1\n",
"Scores: [100.0, 93.33333333333333, 96.66666666666667, 100.0, 90.0]\n",
"Mean Accuracy: 96.000%\n",
"fold size:\n",
"30\n",
"Trees: 5\n",
"Scores: [96.66666666666667, 93.33333333333333, 90.0, 100.0, 96.66666666666667]\n",
"Mean Accuracy: 95.333%\n",
"fold size:\n",
"30\n",
"Trees: 10\n",
"Scores: [96.66666666666667, 90.0, 93.33333333333333, 96.66666666666667, 96.66666666666667]\n",
"Mean Accuracy: 94.667%\n"
]
}
],
"source": [
"n_features = int(sqrt(len(dataset[0])-1)) #it specifies the size of the feature subset for the folds, where the size is close to the square root of the total number of features\n",
"for n_trees in [1, 5, 10]: # will loop three times by taking n_trees equals one, n_trees equals five and n_trees equals ten successively.\n",
"\tscores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)\n",
"\tprint('Trees: %d' % n_trees)\n",
"\tprint('Scores: %s' % scores)\n",
"\tprint('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### checking the model with a specific instance"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iris Versicolour\n"
]
}
],
"source": [
"with open (\"model.json\", \"r\") as in_file:\n",
"\ttrees=json.load(in_file)\n",
"\tprediction=bagging_predict(trees,dataset[55])\n",
"\tprint(prediction)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}