@@ -7,7 +7,12 @@ from random import randrange
 from csv import reader
 from math import sqrt
 import json
+import os.path
+
+# Get the directory of the current script to use when importing data
+# and exporting the model.
+basepath = os.path.dirname(os.path.realpath(__file__))
 
 # Load a CSV file: read it with csv.reader and build the dataset as a list of rows.
 def load_csv(filename):
 	dataset = list()
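Aside: the point of the new basepath lines is that every file the script reads or writes resolves relative to the script itself, not to whatever directory it happens to be launched from. A minimal sketch of the idea, reusing the filenames from this diff:

import os.path

# Resolve the directory containing this script, then build absolute paths
# from it; the result is independent of the current working directory.
basepath = os.path.dirname(os.path.realpath(__file__))
data_path = os.path.join(basepath, 'iris_data.csv')
model_path = os.path.join(basepath, 'random_forest_model.json')
print(data_path, model_path)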
@@ -290,16 +295,17 @@ def bagging_predict(trees, row):
 	return max(set(predictions), key=predictions.count)
 
 # Random Forest Algorithm
-def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
+def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):
 	trees = list()
-	for i in range(n_trees):
+	for _ in range(n_trees):
 		sample = subsample(train, sample_size)
 		# build a tree; the root is a dictionary with keys index, value, left and right
 		tree = build_tree(sample, max_depth, min_size, n_features)
 		trees.append(tree)
-	with open('random_forest_model.json', 'w') as outfile:
-		json.dump(trees, outfile, indent=6)
+	if model_path:
+		with open(model_path, 'w') as outfile:
+			json.dump(trees, outfile, indent=6)
 	# predict on the held-out fold: the forest votes on every row of the test data
 	predictions = [bagging_predict(trees, row) for row in test]
 	# return the votes/predictions of the forest
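Aside: each tree is a plain nest of dicts in the "index, value, left/right" layout named in the comment above, which is why json.dump can serialize the whole forest directly. A hedged illustration with made-up numbers, together with the majority-vote idiom from bagging_predict:

# Hypothetical serialized tree: 'index' is the feature column, 'value' the
# split threshold, and 'left'/'right' hold a sub-dict or a terminal class.
tree = {
    'index': 2,
    'value': 2.45,
    'left': 0,
    'right': {'index': 3, 'value': 1.75, 'left': 1, 'right': 2},
}

# Majority vote as in bagging_predict: the most frequent prediction wins.
predictions = [0, 1, 1, 2, 1]
print(max(set(predictions), key=predictions.count))  # -> 1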
@@ -311,7 +317,7 @@ def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
 seed(2)
 # load and prepare data
 # filename = 'sonar_csv.csv'
-filename = 'iris_data.csv'
+filename = os.path.join(basepath, 'iris_data.csv')
 dataset = load_csv(filename)
 #print(dataset)
 # convert string attributes to integers
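Aside: the string-to-integer conversion itself is outside this diff; below is a minimal sketch of what such a helper usually looks like (the name str_column_to_int and the returned lookup are assumptions, not shown in the patch). Keeping the lookup is what later allows translating integer predictions back to class strings:

# Assumed helper: map each distinct class string in `column` to an integer
# in place, and return the lookup for translating predictions back later.
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    lookup = {value: i for i, value in enumerate(sorted(set(class_values)))}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup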
@@ -331,8 +337,10 @@ sample_size = 1.0
 # size of the random subset of features considered at each split, close to the square root of the total number of features
 n_features = int(sqrt(len(dataset[0]) - 1))
 # try forests of 1, 5 and 10 trees
 for n_trees in [1, 5, 10]:
-	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
+	model_path = os.path.join(basepath, 'random_forest_model_{}-trees.json'.format(n_trees))
+	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features, model_path)
 	print('Trees: %d' % n_trees)
 	print('Scores: %s' % scores)
 	print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))
+
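Aside: a quick sanity check of the feature-subset size for this dataset:

from math import sqrt

# Each iris row holds 4 attributes plus the class label, so len(dataset[0])
# is 5 and the forest tries int(sqrt(5 - 1)) = 2 features at each split.
row_length = 5
print(int(sqrt(row_length - 1)))  # -> 2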
@@ -344,8 +352,8 @@ for n_trees in [1, 5, 10]:
 
 #### pickle trees
 #### use: unpickle trees + bagging_predict with new data
-with open('random_forest_model.json', 'r') as infile:
-	trees = json.load(infile)
-prediction = bagging_predict(trees, dataset[23])
-# this gives a number; the output has to be mapped back to the class string
-print(prediction)
+# with open('random_forest_model.json', 'r') as infile:
+# 	trees = json.load(infile)
+# prediction = bagging_predict(trees, dataset[23])
+# # this gives a number; the output has to be mapped back to the class string
+# print(prediction)
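Aside: a hedged end-to-end sketch of what the commented-out reload could look like once the per-forest model files exist; everything except bagging_predict, basepath and dataset (the lookup in particular) is an assumption carried over from the sketches above:

import json
import os.path

# Reload one exported forest; JSON round-trips the dict-based trees as-is.
model_path = os.path.join(basepath, 'random_forest_model_10-trees.json')
with open(model_path, 'r') as infile:
    trees = json.load(infile)

prediction = bagging_predict(trees, dataset[23])  # integer class index
int_to_class = {v: k for k, v in lookup.items()}  # invert the assumed lookup
print(int_to_class[prediction])                   # e.g. 'Iris-setosa'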