Updated readme. Altered scripts to use absolute paths so they can be run from anywhere and still place exports, or load imports from, paths relative to the script.

main
Gijs 3 years ago
parent f045b8158d
commit 44222e5a13

2
.gitignore vendored

@ -0,0 +1,2 @@
*.pyc
commenting_code_model/visualizer/random_forest_model_tree_*

@ -17,7 +17,9 @@ The repository has an annotated version of the algorithm, a dataset with trainin
## Running
Run the script *commenting_code_model/random_forest_model_altered.py* to generate random forests. The generated models will be json files in the same directory as the model.
Optionally run *commenting_code_model/visualizer/visualizer.py* to generate visualizations. Those visualizations will be placed in the visualizer folder.
## Requirements

@ -7,7 +7,12 @@ from random import randrange
from csv import reader from csv import reader
from math import sqrt from math import sqrt
import json import json
import os.path
# Get the directory of the current script to use in importing data
# and exporting the model.
basepath = os.path.dirname(os.path.realpath(__file__))
# Load a CSV file. Definition of the function to read the csv and create dataset here # Load a CSV file. Definition of the function to read the csv and create dataset here
def load_csv(filename): def load_csv(filename):
dataset = list() dataset = list()
@ -290,16 +295,17 @@ def bagging_predict(trees, row):
return max(set(predictions), key=predictions.count) return max(set(predictions), key=predictions.count)
# Random Forest Algorithm # Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features): def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):
trees = list() trees = list()
for i in range(n_trees): for _ in range(n_trees):
sample = subsample(train, sample_size) sample = subsample(train, sample_size)
# building a tree / root is dictionary with index, value, left/right) # building a tree / root is dictionary with index, value, left/right)
tree = build_tree(sample, max_depth, min_size, n_features) tree = build_tree(sample, max_depth, min_size, n_features)
trees.append(tree) trees.append(tree)
with open('random_forest_model.json', 'w') as outfile: if model_path:
json.dump(trees, outfile, indent = 6) with open(model_path, 'w') as outfile:
json.dump(trees, outfile, indent = 6)
# prediction using one of the folds we separated in the beginning, forest votes on every row of test data # prediction using one of the folds we separated in the beginning, forest votes on every row of test data
predictions = [bagging_predict(trees, row) for row in test] predictions = [bagging_predict(trees, row) for row in test]
# returns votes/predictions of the forest # returns votes/predictions of the forest
@ -311,7 +317,7 @@ def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_feat
seed(2) seed(2)
# load and prepare data # load and prepare data
# filename = 'sonar_csv.csv' # filename = 'sonar_csv.csv'
filename = 'iris_data.csv' filename = os.path.join(basepath, 'iris_data.csv')
dataset = load_csv(filename) dataset = load_csv(filename)
#print(dataset) #print(dataset)
# convert string attributes to integers # convert string attributes to integers
@ -331,8 +337,10 @@ sample_size = 1.0
# it specifies the size of the subset of features for the folds, where the size is close to the square root of the total number of features # it specifies the size of the subset of features for the folds, where the size is close to the square root of the total number of features
n_features = int(sqrt(len(dataset[0])-1)) n_features = int(sqrt(len(dataset[0])-1))
# it tries forest of 1 tree, 5 trees, 10 trees # it tries forest of 1 tree, 5 trees, 10 trees
for n_trees in [1, 5, 10]: for n_trees in [1, 5, 10]:
scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features) model_path = os.path.join(basepath, 'random_forest_model_{}-trees.json'.format(n_trees))
scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features, model_path)
print('Trees: %d' % n_trees) print('Trees: %d' % n_trees)
print('Scores: %s' % scores) print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores)))) print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
@ -344,8 +352,8 @@ for n_trees in [1, 5, 10]:
#### pickle trees #### pickle trees
#### use: unpickle trees + bagging_predict with new data #### use: unpickle trees + bagging_predict with new data
with open('random_forest_model.json', 'r') as infile: # with open('random_forest_model.json', 'r') as infile:
trees = json.load(infile) # trees = json.load(infile)
prediction = bagging_predict(trees, dataset[23]) # prediction = bagging_predict(trees, dataset[23])
# this gives a number, you have to reorganise model to get back the string of the class # # this gives a number, you have to reorganise model to get back the string of the class
print(prediction) # print(prediction)

@ -59,16 +59,36 @@ def make_graph (graphname):
graph.attr('graph', splines='line', rankdir='BT') graph.attr('graph', splines='line', rankdir='BT')
return graph return graph
def visualize(tree, graphname, generate_node_name=make_name_generator(length=3), directory=None):
    """Render one decision tree as a Graphviz graph called *graphname*.

    The rendered output is written into *directory* (current working
    directory when None).

    NOTE(review): the default ``generate_node_name`` is built once at
    definition time, so every call without an explicit generator shares
    the same one — presumably intentional so node names stay unique
    across graphs; confirm.
    """
    rendered = make_graph(graphname)
    visualize_node(rendered, generate_node_name, tree)
    rendered.render(graphname, directory=directory)
if __name__ == '__main__':
    import json
    import os.path
    import glob

    # Resolve everything relative to this script's directory so the
    # visualizer works no matter which cwd it is launched from.
    basepath = os.path.dirname(os.path.realpath(__file__))
    globpath = os.path.realpath(
        os.path.join(basepath, '..', 'random_forest_model_*-trees.json'))

    # Search for exported models next to the model script.
    models = glob.glob(globpath)
    for modelpath in models:
        print("Found {}".format(modelpath))
        # Parse the forest (a JSON list of trees) from the exported model.
        with open(modelpath, 'r') as file_in:
            forest = json.load(file_in)
        modelname, _ = os.path.splitext(os.path.basename(modelpath))
        # Walk through the forest and visualize each tree.
        for idx, tree in enumerate(forest):
            # Fixed: the old code formatted a single-placeholder pattern
            # with two arguments (idx, len(forest)); str.format silently
            # ignored the extra one. Format the name directly instead.
            graphname = '{}_tree_{}'.format(modelname, idx)
            print('Visualizing tree {} of {}'.format(idx, len(forest)))
            visualize(tree, graphname, directory=basepath)
        print()
    print('Graphs placed in: {}'.format(basepath))
Loading…
Cancel
Save