@@ -7,7 +7,12 @@ from random import randrange
 from csv import reader
 from math import sqrt
 import json
+import os.path
+
+# Get the directory of the current script to use when importing data
+# and exporting the model.
+basepath = os.path.dirname(os.path.realpath(__file__))
 
 # Load a CSV file: read it with csv.reader and build the dataset as a list of rows.
 def load_csv(filename):
 	dataset = list()
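Aside: the point of the new basepath lines is that every file the script reads or writes resolves relative to the script itself, not to whatever directory it happens to be launched from. A minimal sketch of the idea, reusing the filenames from this diff:

import os.path

# Resolve the directory containing this script, then build absolute paths
# from it; the result is independent of the current working directory.
basepath = os.path.dirname(os.path.realpath(__file__))
data_path = os.path.join(basepath, 'iris_data.csv')
model_path = os.path.join(basepath, 'random_forest_model.json')
print(data_path, model_path)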
@@ -290,16 +295,17 @@ def bagging_predict(trees, row):
 	return max(set(predictions), key=predictions.count)
 
 # Random Forest Algorithm
-def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
+def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features, model_path=None):
 	trees = list()
-	for i in range(n_trees):
+	for _ in range(n_trees):
 		sample = subsample(train, sample_size)
 		# build a tree; the root is a dictionary with keys index, value, left and right
 		tree = build_tree(sample, max_depth, min_size, n_features)
 		trees.append(tree)
-	with open('random_forest_model.json', 'w') as outfile:
-		json.dump(trees, outfile, indent=6)
+	if model_path:
+		with open(model_path, 'w') as outfile:
+			json.dump(trees, outfile, indent=6)
 	# predict on the held-out fold: the forest votes on every row of the test data
 	predictions = [bagging_predict(trees, row) for row in test]
 	# return the votes/predictions of the forest
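Aside: each tree is a plain nest of dicts in the "index, value, left/right" layout named in the comment above, which is why json.dump can serialize the whole forest directly. A hedged illustration with made-up numbers, together with the majority-vote idiom from bagging_predict:

# Hypothetical serialized tree: 'index' is the feature column, 'value' the
# split threshold, and 'left'/'right' hold a sub-dict or a terminal class.
tree = {
    'index': 2,
    'value': 2.45,
    'left': 0,
    'right': {'index': 3, 'value': 1.75, 'left': 1, 'right': 2},
}

# Majority vote as in bagging_predict: the most frequent prediction wins.
predictions = [0, 1, 1, 2, 1]
print(max(set(predictions), key=predictions.count))  # -> 1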
@@ -311,7 +317,7 @@ def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
 seed(2)
 # load and prepare data
 # filename = 'sonar_csv.csv'
-filename = 'iris_data.csv'
+filename = os.path.join(basepath, 'iris_data.csv')
 dataset = load_csv(filename)
 #print(dataset)
 # convert string attributes to integers
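Aside: the string-to-integer conversion itself is outside this diff; below is a minimal sketch of what such a helper usually looks like (the name str_column_to_int and the returned lookup are assumptions, not shown in the patch). Keeping the lookup is what later allows translating integer predictions back to class strings:

# Assumed helper: map each distinct class string in `column` to an integer
# in place, and return the lookup for translating predictions back later.
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    lookup = {value: i for i, value in enumerate(sorted(set(class_values)))}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup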
@@ -331,8 +337,10 @@ sample_size = 1.0
 # size of the random subset of features considered at each split, close to the square root of the total number of features
 n_features = int(sqrt(len(dataset[0]) - 1))
 # try forests of 1, 5 and 10 trees
 for n_trees in [1, 5, 10]:
-	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
+	model_path = os.path.join(basepath, 'random_forest_model_{}-trees.json'.format(n_trees))
+	scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features, model_path)
 	print('Trees: %d' % n_trees)
 	print('Scores: %s' % scores)
 	print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))
+
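Aside: a quick sanity check of the feature-subset size for this dataset:

from math import sqrt

# Each iris row holds 4 attributes plus the class label, so len(dataset[0])
# is 5 and the forest tries int(sqrt(5 - 1)) = 2 features at each split.
row_length = 5
print(int(sqrt(row_length - 1)))  # -> 2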
@@ -344,8 +352,8 @@ for n_trees in [1, 5, 10]:
 
 #### pickle trees
 #### use: unpickle trees + bagging_predict with new data
-with open('random_forest_model.json', 'r') as infile:
-	trees = json.load(infile)
-prediction = bagging_predict(trees, dataset[23])
-# this gives a number; the output has to be mapped back to the class string
-print(prediction)
+# with open('random_forest_model.json', 'r') as infile:
+# 	trees = json.load(infile)
+# prediction = bagging_predict(trees, dataset[23])
+# # this gives a number; the output has to be mapped back to the class string
+# print(prediction)
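Aside: a hedged end-to-end sketch of what the commented-out reload could look like once the per-forest model files exist; everything except bagging_predict, basepath and dataset (the lookup in particular) is an assumption carried over from the sketches above:

import json
import os.path

# Reload one exported forest; JSON round-trips the dict-based trees as-is.
model_path = os.path.join(basepath, 'random_forest_model_10-trees.json')
with open(model_path, 'r') as infile:
    trees = json.load(infile)

prediction = bagging_predict(trees, dataset[23])  # integer class index
int_to_class = {v: k for k, v in lookup.items()}  # invert the assumed lookup
print(int_to_class[prediction])                   # e.g. 'Iris-setosa'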