diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..976ba81 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +commenting_code_model/visualizer/random_forest_model_tree_* diff --git a/README.md b/README.md index f8a66b7..592c8ee 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,9 @@ The repository has an annotated version of the algorithm, a dataset with trainin ## Running +Run the script *commenting_code_model/random_forest_model_altered.py* to generate random forests. The generated models will be json files in the same directory as the model. +Optionally run the *commenting_code_model/visualizer/visualizer.py* to generate visualizations. Those visualizations will be placed in the visualizer folder. ## Requirements diff --git a/commenting_code_model/random_forest_model_altered.py b/commenting_code_model/random_forest_model_altered.py index 015b54e..4b61e04 100644 --- a/commenting_code_model/random_forest_model_altered.py +++ b/commenting_code_model/random_forest_model_altered.py @@ -7,7 +7,12 @@ from random import randrange from csv import reader from math import sqrt import json - +import os.path + +# Get the directory of the current script to use in importing data +# and exporting the model. +basepath = os.path.dirname(os.path.realpath(__file__)) + # Load a CSV file. 
Definition of the function to read the csv and create dataset here def load_csv(filename): dataset = list() @@ -290,16 +295,17 @@ def bagging_predict(trees, row): return max(set(predictions), key=predictions.count) # Random Forest Algorithm -def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features): +def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None): trees = list() - for i in range(n_trees): + for _ in range(n_trees): sample = subsample(train, sample_size) # building a tree / root is dictionary with index, value, left/right) tree = build_tree(sample, max_depth, min_size, n_features) trees.append(tree) - with open('random_forest_model.json', 'w') as outfile: - json.dump(trees, outfile, indent = 6) + if model_path: + with open(model_path, 'w') as outfile: + json.dump(trees, outfile, indent = 6) # prediction using one of the folds we separated in the beginning, forest votes on every row of test data predictions = [bagging_predict(trees, row) for row in test] # returns votes/predictions of the forest @@ -311,7 +317,7 @@ def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_feat seed(2) # load and prepare data # filename = 'sonar_csv.csv' -filename = 'iris_data.csv' +filename = os.path.join(basepath, 'iris_data.csv') dataset = load_csv(filename) #print(dataset) # convert string attributes to integers @@ -331,8 +337,10 @@ sample_size = 1.0 # it specifies the size of the subset of features for the folds, where the size is close to the square root of the total number of features n_features = int(sqrt(len(dataset[0])-1)) # it tries forest of 1 tree, 5 trees, 10 trees + for n_trees in [1, 5, 10]: - scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features) + model_path = os.path.join(basepath, 'random_forest_model_{}-trees.json'.format(n_trees)) + scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, 
min_size, sample_size, n_trees, n_features, model_path) print('Trees: %d' % n_trees) print('Scores: %s' % scores) print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores)))) @@ -344,8 +352,8 @@ for n_trees in [1, 5, 10]: #### pickle trees #### use: unpickle trees + bagging_predict with new data -with open('random_forest_model.json', 'r') as infile: - trees = json.load(infile) - prediction = bagging_predict(trees, dataset[23]) - # this gives a number, you have to reorganise model to get back the string of the class - print(prediction) \ No newline at end of file +# with open('random_forest_model.json', 'r') as infile: +# trees = json.load(infile) +# prediction = bagging_predict(trees, dataset[23]) +# # this gives a number, you have to reorganise model to get back the string of the class +# print(prediction) \ No newline at end of file diff --git a/commenting_code_model/visualizer/visualizer.py b/commenting_code_model/visualizer/visualizer.py index f1fe10a..f8cd3c3 100644 --- a/commenting_code_model/visualizer/visualizer.py +++ b/commenting_code_model/visualizer/visualizer.py @@ -59,16 +59,36 @@ def make_graph (graphname): graph.attr('graph', splines='line', rankdir='BT') return graph -def visualize (tree, graphname, generate_node_name = make_name_generator(length=3)): +def visualize (tree, graphname, generate_node_name = make_name_generator(length=3), directory=None): graph = make_graph(graphname) visualize_node(graph, generate_node_name, tree) - graph.render(graphname) + graph.render(graphname, directory=directory) if __name__ == '__main__': import json + import os.path + import glob + basepath = os.path.dirname(os.path.realpath(__file__)) + globpath = os.path.realpath(os.path.join(basepath, '..', 'random_forest_model_*-trees.json')) + + models = glob.glob(globpath) + + # Search for exported models + for modelpath in models: + print("Found {}".format(modelpath)) - with open('../random_forest_model.json', 'r') as file_in: - forest = json.load(file_in) + # Open 
model + with open(modelpath, 'r') as file_in: + # Parse the forest + forest = json.load(file_in) + modelname, _ = os.path.splitext(os.path.basename(modelpath)) + graphnamepattern = '{}_tree_{{}}'.format(modelname) - for idx, tree in enumerate(forest): - visualize(tree, 'random-tree-{}'.format(idx)) \ No newline at end of file + # Walk through the forest and visualize the trees + for idx, tree in enumerate(forest): + graphname = graphnamepattern.format(idx, len(forest)) + print('Visualizing tree {} of {}'.format(idx, len(forest))) + visualize(tree, graphname, directory=basepath) + print() + + print('Graphs placed in: {}'.format(basepath)) \ No newline at end of file