new experimentation on sensible similarity metrics between two sentences
parent
40ea2cae20
commit
e69058624e
@@ -0,0 +1,91 @@
import sys
import json
import glob

from graphviz import Graph

# appending a path so the customised summa fork can be imported
sys.path.append('../')

# importing customised module
import summa.edits
from summa.edits import csv_to_matrix, custom_summarize


# main
# ------------------------------------------------------------------------


def print_score(name, scored_sentences):
    print(name)
    for s in scored_sentences:
        print(s.text)
        print(s.score)
        print()
    print()


if __name__ == '__main__':

    paths = glob.glob('../summa/data/*.csv')

    matrices = []

    for path in paths:

        # CSV to matrix
        matrix = csv_to_matrix(path)
        matrices.append(matrix)
        # print(json.dumps(matrix, indent=4))

        # we give summa the whole text to tokenize, from the csv keys
        text = " ".join(matrix.keys())

        # random walk with pagerank
        scored_sentences, weighted_graph = custom_summarize(text, matrix)

        # sort the scored sentences
        sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)

        # print them
        print_score(path, sorted_sentences)

        # create a graph
        graphviz_graph = Graph(path, engine='fdp', format='svg')
        graphviz_graph.attr('graph', overlap='false')

        for i, node1 in enumerate(weighted_graph.nodes()):
            graphviz_graph.node(str(i), node1)

            for j, node2 in enumerate(weighted_graph.nodes()):

                # the similarity matrix is expected to be symmetrical
                if weighted_graph.edge_weight((node1, node2)) != weighted_graph.edge_weight((node2, node1)):
                    print("THIS IS NOT SYMMETRICAL")

                if i < j:
                    original_weight = weighted_graph.edge_weight((node1, node2))
                    graphviz_graph.edge(str(i), str(j), weight=str(original_weight * 10), penwidth=str(original_weight))

                elif i > j:
                    original_weight = weighted_graph.edge_weight((node2, node1))
                    graphviz_graph.edge(str(j), str(i), weight=str(original_weight * 10), penwidth=str(original_weight))

        graphviz_graph.render(path + '_graph')


    # # compute the average matrix
    # matrix_keys = matrices[0].keys()
    # average_matrix = {}
    # for i in matrix_keys:
    #     average_matrix[i] = {}
    #     for j in matrix_keys:
    #         average_matrix[i][j] = sum([matrix[i][j] for matrix in matrices]) / len(matrices)

    # # random walk with pagerank
    # scored_sentences = custom_summarize(text, average_matrix)
    # # sort the scored sentences
    # sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)

    # # print them
    # print_score('average', sorted_sentences)
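For context, a minimal sketch of how such a sentence-similarity matrix could be built and scored. The Jaccard token-overlap metric and the power-iteration PageRank below are illustrative assumptions only; they are not the metric stored in the CSV files, nor the implementation inside summa.edits.custom_summarize, which this script imports but whose internals are not shown in this diff.

# Sketch under assumptions: a symmetric sentence-similarity matrix as a
# dict-of-dicts keyed by sentence text (the shape csv_to_matrix() appears to
# return), scored with a simple power-iteration PageRank over that matrix.

def jaccard_similarity(a, b):
    """Share of unique tokens the two sentences have in common (assumed metric)."""
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / len(ta | tb) if ta and tb else 0.0

def build_similarity_matrix(sentences):
    """Dict-of-dicts keyed by sentence text, like the CSV-derived matrices."""
    return {s1: {s2: jaccard_similarity(s1, s2) for s2 in sentences}
            for s1 in sentences}

def pagerank_scores(matrix, damping=0.85, iterations=50):
    """Random walk over the weighted similarity graph, starting from a uniform distribution."""
    sentences = list(matrix.keys())
    n = len(sentences)
    scores = {s: 1.0 / n for s in sentences}
    for _ in range(iterations):
        new_scores = {}
        for s in sentences:
            rank = 0.0
            for other in sentences:
                if other == s:
                    continue
                # weight mass the neighbour distributes to all other sentences
                out_weight = sum(w for t, w in matrix[other].items() if t != other)
                if out_weight > 0:
                    rank += scores[other] * matrix[other][s] / out_weight
            new_scores[s] = (1 - damping) / n + damping * rank
        scores = new_scores
    return scores

# Example: three toy sentences, printed highest-scoring first.
sents = ["the cat sat on the mat", "a cat on a mat", "dogs chase cats"]
for sent, score in sorted(pagerank_scores(build_similarity_matrix(sents)).items(),
                          key=lambda kv: -kv[1]):
    print(round(score, 3), sent)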