From 93b14bba40a3193b6ab76f201f5be9d8b588319f Mon Sep 17 00:00:00 2001
From: Gijs
Date: Wed, 22 Jun 2022 14:51:43 +0200
Subject: [PATCH] Updated generated code.

---
 commenting_code_model/encode_model.py          | 26 ++++++++++++++++---
 .../random_forest_model_altered.py             | 19 ++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/commenting_code_model/encode_model.py b/commenting_code_model/encode_model.py
index 90f7e17..0ea80e4 100644
--- a/commenting_code_model/encode_model.py
+++ b/commenting_code_model/encode_model.py
@@ -36,9 +36,29 @@ def encode_tree(tree):
 
 def make_classifier (tree):
     lines = [
-        "char predict(float *r) {",
-        *encode_tree(tree),
-        "}"
+        "#pragma once",
+        "#include ",
+        "namespace PublishingHouse",
+        "{",
+        *map(indent_line, [
+            "namespace RandomForest",
+            "{",
+            *map(indent_line, [
+                "class DecisionTree",
+                "{",
+                "public:",
+                *map(indent_line, [
+                    "char* predict(float *r)",
+                    "{",
+                    *encode_tree(tree),
+                    "}",
+                ]),
+                "private:",
+                "};"
+            ]),
+            "}"
+        ]),
+        "}",
     ]
     return('\n'.join(lines))
 
diff --git a/commenting_code_model/random_forest_model_altered.py b/commenting_code_model/random_forest_model_altered.py
index 4b61e04..f462328 100644
--- a/commenting_code_model/random_forest_model_altered.py
+++ b/commenting_code_model/random_forest_model_altered.py
@@ -9,11 +9,14 @@ from math import sqrt
 import json
 import os.path
 
+#from birdseye import eye
+
 # Get the directory of the current script to use in importing data
 # and exporting the model.
 basepath = os.path.dirname(os.path.realpath(__file__))
 
 # Load a CSV file. Definition of the function to read the csv and create dataset here
+#@eye
 def load_csv(filename):
     dataset = list()
     with open(filename, 'r') as file:
@@ -25,11 +28,13 @@ def load_csv(filename):
     return dataset
 
 # Convert string column to float - original dataset is in string format
+#@eye
 def str_column_to_float(dataset, column):
     for row in dataset:
         row[column] = float(row[column].strip())
 
 # Convert string column to integer / transforms classes 'mine' and 'rock' into 1 and 2
+#@eye
 def str_column_to_int(dataset, column):
     # extracts values of the classes of the dataset: array of all the mine, rock
     class_values = [row[column] for row in dataset]
@@ -49,6 +54,7 @@ def str_column_to_int(dataset, column):
     return lookup
 
 # Split a dataset into k folds
+#@eye
 def cross_validation_split(dataset, n_folds):
     # creates list
     dataset_split = list()
@@ -70,6 +76,7 @@ def cross_validation_split(dataset, n_folds):
     return dataset_split #return the dataset_split, a list of folds
 
 # Calculate accuracy percentage
+#@eye
 def accuracy_metric(actual, predicted):
     correct = 0
     # loops through index list which has length of actual classes
@@ -82,6 +89,7 @@ def accuracy_metric(actual, predicted):
     return correct / float(len(actual)) * 100.0
 
 # Evaluate an algorithm using a cross validation split
+#@eye
 def evaluate_algorithm(dataset, algorithm, n_folds, *args):
     # split dataset in n folds
     folds = cross_validation_split(dataset, n_folds)
@@ -120,6 +128,7 @@ def evaluate_algorithm(dataset, algorithm, n_folds, *args):
 
 # Split a dataset based on a feature and a feature value defined in build tree
 # just trying many times, benefitting from speed of computer
+#@eye
 def test_split(index, value, dataset):
     left, right = list(), list()
     for row in dataset:
@@ -134,6 +143,7 @@
 
 # Calculate the Gini index for a split dataset, using left/right og test split as groups
 # cfr calculating wealth distribution: https://en.wikipedia.org/wiki/Gini_coefficient
+#@eye
 def gini_index(groups, classes):
     # count all samples at split point (the dataset), converts it in a float in order to do divisions
     n_instances = float(sum([len(group) for group in groups]))
@@ -158,6 +168,7 @@ def gini_index(groups, classes):
     return gini
 
 # Select the best split point for a dataset
+#@eye
 def get_split(dataset, n_features):
     # takes last element of each row (class) and returns it as a row, as it is a set, it has only 2 values
     class_values = list(set(row[-1] for row in dataset))
@@ -187,6 +198,7 @@ def get_split(dataset, n_features):
     return {'index':b_index, 'value':b_value, 'groups':b_groups, 'gini':b_score}
 
 # Create a terminal node value = node at end of the tree = end leaf
+#@eye
 def to_terminal(group):
     # returns list of classes of group
     outcomes = [row[-1] for row in group]
@@ -195,6 +207,7 @@ def to_terminal(group):
     return max(set(outcomes), key=outcomes.count)
 
 # Counts the amount of unique values in a 'group' (rows in dataset)
+#@eye
 def count_unique_values (group):
     # Pick classes in the dataset, transform to a set
     # count amount of values
@@ -203,6 +216,7 @@
 # Create child splits for a node or make terminals/end leafs
 # recursive function, it calls itself
 # node is dictionary returned by get_split (b_index, b_value, b_groups)
+#@eye
 def split(node, max_depth, min_size, n_features, depth):
     left, right = node['groups']
     del(node['groups'])
@@ -244,6 +258,7 @@ def split(node, max_depth, min_size, n_features, depth):
 # return no value because functions are working on the same dictionaries
 
 # Build a decision tree
+#@eye
 def build_tree(train, max_depth, min_size, n_features):
     # root of decision tree is defined by dictionary of index, value, 2 groups (left/right of the split)
     root = get_split(train, n_features)
@@ -255,6 +270,7 @@ def build_tree(train, max_depth, min_size, n_features):
 
 # Make a prediction with a decision tree
 # recursive function as well
+#@eye
 def predict(node, row):
     # node index = column feature, it looks up value for this feature for this row in dataset
     # compare feature value of row you're checking with feature value of node
@@ -278,6 +294,7 @@ def predict(node, row):
 # Create a random subsample from the dataset with replacement, ratio is called sample_size further on
 # This is called BOOTSTRAPPING: build new datasets from the original data, with the same number of rows
 # with replacement: after selecting the row we put it back into the data, so it can be selected twice or more
+#@eye
 def subsample(dataset, ratio):
     sample = list()
     # if it is smaller than 1, not all dataset is taken as sample - he uses the full dataset
@@ -288,6 +305,7 @@ def subsample(dataset, ratio):
     return sample
 
 # Make a prediction with a list of bagged trees
+#@eye
 def bagging_predict(trees, row):
     # asks the forest to predict class for every row in the test data, this gives list of votes
     predictions = [predict(tree, row) for tree in trees]
@@ -295,6 +313,7 @@ def bagging_predict(trees, row):
     return max(set(predictions), key=predictions.count)
 
 # Random Forest Algorithm
+#@eye
 def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):
     trees = list()
     for _ in range(n_trees):
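
Note on the encode_model.py hunk: the new list comprehension maps an indent_line helper over each nested list of generated C++ lines, but the helper itself is defined outside this hunk. A minimal sketch of what it presumably does, assuming one indentation level is four spaces:

    # Hypothetical reconstruction of the indent_line helper used above; the
    # patch only shows it being mapped over nested lists of generated lines.
    # The four-space indent width is an assumption, not shown in the diff.
    def indent_line(line):
        return "    " + line

Each nested *map(indent_line, [...]) then adds one indentation level per nesting depth, so the emitted predict body ends up indented inside the DecisionTree class, the RandomForest namespace, and the outer PublishingHouse namespace.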
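
Note on the random_forest_model_altered.py hunks: they stage a commented-out birdseye import and a commented-out #@eye line above every function. Uncommenting both switches on the birdseye debugger, which records each call of the decorated function for later inspection in its browser UI; a sketch of the enabled form, using load_csv as the example:

    # The instrumentation the patch stages, shown here uncommented.
    from birdseye import eye

    @eye
    def load_csv(filename):
        ...

Leaving the decorators commented out keeps the training script's behavior unchanged while making the tracing trivial to enable.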