Updated readme. Altered scripts to use absolute paths so they can be run from anywhere and still place exports, or load imports, at paths relative to the script.

4 years ago · 44222e5a13
parent f045b8158d
commit 44222e5a13
4 changed files with 50 additions and 18 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 *.pyc
 commenting_code_model/visualizer/random_forest_model_tree_*
--- a/README.md
+++ b/README.md
@ -17,7 +17,9 @@ The repository has an annotated version of the algorithm, a dataset with trainin
 ## Running
 Run the script *commenting_code_model/random_forest_model_altered.py* to generate random forests. The generated models will be JSON files in the same directory as the script.
 Optionally run the *commenting_code_model/visualizer/visualizer.py* to generate visualizations. Those visualizations will be placed in the visualizer folder.
 ## Requirements
--- a/commenting_code_model/random_forest_model_altered.py
+++ b/commenting_code_model/random_forest_model_altered.py
@ -7,6 +7,11 @@ from random import randrange
 from csv import reader
 from math import sqrt
 import json
 import os.path
 # Get the directory of the current script to use in importing data
 # and exporting the model.
 basepath = os.path.dirname(os.path.realpath(__file__))
 # Load a CSV file. Definition of the function to read the csv and create dataset here
 def load_csv(filename):
@ -290,15 +295,16 @@ def bagging_predict(trees, row):
 	return max(set(predictions), key=predictions.count)
 # Random Forest Algorithm
-def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
+def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):
 	trees = list()
-	for i in range(n_trees):
+	for _ in range(n_trees):
 		sample = subsample(train, sample_size)
 		# building a tree (root is a dictionary with index, value, left/right)
 		tree = build_tree(sample, max_depth, min_size, n_features)
 		trees.append(tree)
-	with open('random_forest_model.json', 'w') as outfile:
+	if model_path:
 		with open(model_path, 'w') as outfile:
 			json.dump(trees, outfile, indent = 6)
 	# prediction using one of the folds we separated in the beginning, forest votes on every row of test data
 	predictions = [bagging_predict(trees, row) for row in test]
@ -311,7 +317,7 @@ def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_feat
 seed(2)
 # load and prepare data
 # filename = 'sonar_csv.csv'
-filename = 'iris_data.csv'
+filename = os.path.join(basepath, 'iris_data.csv')
 dataset = load_csv(filename)
 #print(dataset)
 # convert string attributes to integers
@ -331,8 +337,10 @@ sample_size = 1.0
 # it specifies the size of the subset of features for the folds, where the size is close to the square root of the total number of features
 n_features = int(sqrt(len(dataset[0])-1))
 # it tries forest of 1 tree, 5 trees, 10 trees
 for n_trees in [1, 5, 10]:
-	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
+	model_path = os.path.join(basepath, 'random_forest_model_{}-trees.json'.format(n_trees))
 	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features, model_path)
 	print('Trees: %d' % n_trees)
 	print('Scores: %s' % scores)
 	print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
@ -344,8 +352,8 @@ for n_trees in [1, 5, 10]:
 #### pickle trees
 #### use: unpickle trees + bagging_predict with new data
-with open('random_forest_model.json', 'r') as infile:
+# with open('random_forest_model.json', 'r') as infile:
-	trees = json.load(infile)
+# 	trees = json.load(infile)
-	prediction = bagging_predict(trees, dataset[23])
+# 	prediction = bagging_predict(trees, dataset[23])
-	# this gives a number, you have to reorganise model to get back the string of the class
+# 	# this gives a number, you have to reorganise model to get back the string of the class
-	print(prediction)
+# 	print(prediction)
--- a/commenting_code_model/visualizer/visualizer.py
+++ b/commenting_code_model/visualizer/visualizer.py
@ -59,16 +59,36 @@ def make_graph (graphname):
  graph.attr('graph', splines='line', rankdir='BT')
  return graph
-def visualize (tree, graphname, generate_node_name = make_name_generator(length=3)):
+def visualize (tree, graphname, generate_node_name = make_name_generator(length=3), directory=None):
  graph = make_graph(graphname)
  visualize_node(graph, generate_node_name, tree)
-  graph.render(graphname)
+  graph.render(graphname, directory=directory)
 if __name__ == '__main__':
  import json
  import os.path
  import glob
  basepath = os.path.dirname(os.path.realpath(__file__))
  globpath = os.path.realpath(os.path.join(basepath, '..', 'random_forest_model_*-trees.json'))
-  with open('../random_forest_model.json', 'r') as file_in:
+  models = glob.glob(globpath)
  # Search for exported models
  for modelpath in models:
    print("Found {}".format(modelpath))
    # Open model
    with open(modelpath, 'r') as file_in:
      # Parse the forest
      forest = json.load(file_in)
      modelname, _ = os.path.splitext(os.path.basename(modelpath))
      graphnamepattern = '{}_tree_{{}}'.format(modelname)
      # Walk through the forest and visualize the trees
      for idx, tree in enumerate(forest):
-      visualize(tree, 'random-tree-{}'.format(idx))
+        graphname = graphnamepattern.format(idx, len(forest))
        print('Visualizing tree {} of {}'.format(idx, len(forest)))
        visualize(tree, graphname, directory=basepath)
      print()
  print('Graphs placed in: {}'.format(basepath))
		`@ -0,0 +1,2 @@`
							`*.pyc`
							`commenting_code_model/visualizer/random_forest_model_tree_*`