Updated generated code.

main
Gijs 3 years ago
parent 8e132e035a
commit 93b14bba40

@ -36,9 +36,29 @@ def encode_tree(tree):
def make_classifier (tree): def make_classifier (tree):
lines = [ lines = [
"char predict(float *r) {", "#pragma once",
"#include <cstdarg>",
"namespace PublishingHouse",
"{",
*map(indent_line, [
"namespace RandomForest",
"{",
*map(indent_line, [
"class DecisionTree",
"{",
"public:",
*map(indent_line, [
"char* predict(float *r)",
"{",
*encode_tree(tree), *encode_tree(tree),
"}",
]),
"private:",
"};"
]),
"}" "}"
]),
"}",
] ]
return('\n'.join(lines)) return('\n'.join(lines))

@ -9,11 +9,14 @@ from math import sqrt
import json import json
import os.path import os.path
#from birdseye import eye
# Get the directory of the current script to use in importing data # Get the directory of the current script to use in importing data
# and exporting the model. # and exporting the model.
basepath = os.path.dirname(os.path.realpath(__file__)) basepath = os.path.dirname(os.path.realpath(__file__))
# Load a CSV file. Definition of the function to read the csv and create dataset here # Load a CSV file. Definition of the function to read the csv and create dataset here
#@eye
def load_csv(filename): def load_csv(filename):
dataset = list() dataset = list()
with open(filename, 'r') as file: with open(filename, 'r') as file:
@ -25,11 +28,13 @@ def load_csv(filename):
return dataset return dataset
# Convert string column to float - original dataset is in string format # Convert string column to float - original dataset is in string format
#@eye
def str_column_to_float(dataset, column): def str_column_to_float(dataset, column):
for row in dataset: for row in dataset:
row[column] = float(row[column].strip()) row[column] = float(row[column].strip())
# Convert string column to integer / transforms classes 'mine' and 'rock' into 1 and 2 # Convert string column to integer / transforms classes 'mine' and 'rock' into 1 and 2
#@eye
def str_column_to_int(dataset, column): def str_column_to_int(dataset, column):
# extracts values of the classes of the dataset: array of all the mine, rock # extracts values of the classes of the dataset: array of all the mine, rock
class_values = [row[column] for row in dataset] class_values = [row[column] for row in dataset]
@ -49,6 +54,7 @@ def str_column_to_int(dataset, column):
return lookup return lookup
# Split a dataset into k folds # Split a dataset into k folds
#@eye
def cross_validation_split(dataset, n_folds): def cross_validation_split(dataset, n_folds):
# creates list # creates list
dataset_split = list() dataset_split = list()
@ -70,6 +76,7 @@ def cross_validation_split(dataset, n_folds):
return dataset_split #return the dataset_split, a list of folds return dataset_split #return the dataset_split, a list of folds
# Calculate accuracy percentage # Calculate accuracy percentage
#@eye
def accuracy_metric(actual, predicted): def accuracy_metric(actual, predicted):
correct = 0 correct = 0
# loops through index list which has length of actual classes # loops through index list which has length of actual classes
@ -82,6 +89,7 @@ def accuracy_metric(actual, predicted):
return correct / float(len(actual)) * 100.0 return correct / float(len(actual)) * 100.0
# Evaluate an algorithm using a cross validation split # Evaluate an algorithm using a cross validation split
#@eye
def evaluate_algorithm(dataset, algorithm, n_folds, *args): def evaluate_algorithm(dataset, algorithm, n_folds, *args):
# split dataset in n folds # split dataset in n folds
folds = cross_validation_split(dataset, n_folds) folds = cross_validation_split(dataset, n_folds)
@ -120,6 +128,7 @@ def evaluate_algorithm(dataset, algorithm, n_folds, *args):
# Split a dataset based on a feature and a feature value defined in build tree # Split a dataset based on a feature and a feature value defined in build tree
# just trying many times, benefitting from speed of computer # just trying many times, benefitting from speed of computer
#@eye
def test_split(index, value, dataset): def test_split(index, value, dataset):
left, right = list(), list() left, right = list(), list()
for row in dataset: for row in dataset:
@ -134,6 +143,7 @@ def test_split(index, value, dataset):
# Calculate the Gini index for a split dataset, using left/right of test split as groups # Calculate the Gini index for a split dataset, using left/right of test split as groups
# cfr calculating wealth distribution: https://en.wikipedia.org/wiki/Gini_coefficient # cfr calculating wealth distribution: https://en.wikipedia.org/wiki/Gini_coefficient
#@eye
def gini_index(groups, classes): def gini_index(groups, classes):
# count all samples at split point (the dataset), converts it in a float in order to do divisions # count all samples at split point (the dataset), converts it in a float in order to do divisions
n_instances = float(sum([len(group) for group in groups])) n_instances = float(sum([len(group) for group in groups]))
@ -158,6 +168,7 @@ def gini_index(groups, classes):
return gini return gini
# Select the best split point for a dataset # Select the best split point for a dataset
#@eye
def get_split(dataset, n_features): def get_split(dataset, n_features):
# takes the last element of each row (the class) and returns them as a list; as it is built from a set, it has only the unique values (2 here) # takes the last element of each row (the class) and returns them as a list; as it is built from a set, it has only the unique values (2 here)
class_values = list(set(row[-1] for row in dataset)) class_values = list(set(row[-1] for row in dataset))
@ -187,6 +198,7 @@ def get_split(dataset, n_features):
return {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score} return {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score}
# Create a terminal node value = node at end of the tree = end leaf # Create a terminal node value = node at end of the tree = end leaf
#@eye
def to_terminal(group): def to_terminal(group):
# returns list of classes of group # returns list of classes of group
outcomes = [row[-1] for row in group] outcomes = [row[-1] for row in group]
@ -195,6 +207,7 @@ def to_terminal(group):
return max(set(outcomes), key=outcomes.count) return max(set(outcomes), key=outcomes.count)
# Counts the amount of unique values in a 'group' (rows in dataset) # Counts the amount of unique values in a 'group' (rows in dataset)
#@eye
def count_unique_values (group): def count_unique_values (group):
# Pick classes in the dataset, transform to a set # Pick classes in the dataset, transform to a set
# count amount of values # count amount of values
@ -203,6 +216,7 @@ def count_unique_values (group):
# Create child splits for a node or make terminals/end leafs # Create child splits for a node or make terminals/end leafs
# recursive function, it calls itself # recursive function, it calls itself
# node is dictionary returned by get_split (b_index, b_value, b_groups) # node is dictionary returned by get_split (b_index, b_value, b_groups)
#@eye
def split(node, max_depth, min_size, n_features, depth): def split(node, max_depth, min_size, n_features, depth):
left, right = node['groups'] left, right = node['groups']
del(node['groups']) del(node['groups'])
@ -244,6 +258,7 @@ def split(node, max_depth, min_size, n_features, depth):
# return no value because functions are working on the same dictionaries # return no value because functions are working on the same dictionaries
# Build a decision tree # Build a decision tree
#@eye
def build_tree(train, max_depth, min_size, n_features): def build_tree(train, max_depth, min_size, n_features):
# root of decision tree is defined by dictionary of index, value, 2 groups (left/right of the split) # root of decision tree is defined by dictionary of index, value, 2 groups (left/right of the split)
root = get_split(train, n_features) root = get_split(train, n_features)
@ -255,6 +270,7 @@ def build_tree(train, max_depth, min_size, n_features):
# Make a prediction with a decision tree # Make a prediction with a decision tree
# recursive function as well # recursive function as well
#@eye
def predict(node, row): def predict(node, row):
# node index = column feature, it looks up value for this feature for this row in dataset # node index = column feature, it looks up value for this feature for this row in dataset
# compare feature value of row you're checking with feature value of node # compare feature value of row you're checking with feature value of node
@ -278,6 +294,7 @@ def predict(node, row):
# Create a random subsample from the dataset with replacement, ratio is called sample_size further on # Create a random subsample from the dataset with replacement, ratio is called sample_size further on
# This is called BOOTSTRAPPING: build new datasets from the original data, with the same number of rows # This is called BOOTSTRAPPING: build new datasets from the original data, with the same number of rows
# with replacement: after selecting the row we put it back into the data, so it can be selected twice or more # with replacement: after selecting the row we put it back into the data, so it can be selected twice or more
#@eye
def subsample(dataset, ratio): def subsample(dataset, ratio):
sample = list() sample = list()
# if the ratio is smaller than 1, not all of the dataset is taken as the sample - the author uses the full dataset # if the ratio is smaller than 1, not all of the dataset is taken as the sample - the author uses the full dataset
@ -288,6 +305,7 @@ def subsample(dataset, ratio):
return sample return sample
# Make a prediction with a list of bagged trees # Make a prediction with a list of bagged trees
#@eye
def bagging_predict(trees, row): def bagging_predict(trees, row):
# asks the forest to predict class for every row in the test data, this gives list of votes # asks the forest to predict class for every row in the test data, this gives list of votes
predictions = [predict(tree, row) for tree in trees] predictions = [predict(tree, row) for tree in trees]
@ -295,6 +313,7 @@ def bagging_predict(trees, row):
return max(set(predictions), key=predictions.count) return max(set(predictions), key=predictions.count)
# Random Forest Algorithm # Random Forest Algorithm
#@eye
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None): def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):
trees = list() trees = list()
for _ in range(n_trees): for _ in range(n_trees):

Loading…
Cancel
Save