new experimentation on sensible similarity metrics between 2 sentences

3 years ago · e69058624e
parent 40ea2cae20
commit e69058624e
1 changed files with 91 additions and 0 deletions
--- a/exp.subjective-similarity-metrics/main.py
+++ b/exp.subjective-similarity-metrics/main.py
@ -0,0 +1,91 @@
+import sys
+import json
+import glob
+from graphviz import Graph
+
+# appending a path
+sys.path.append('../')
+  
+# importing customised module
+import summa.edits
+from summa.edits import csv_to_matrix, custom_summarize
+
+
+#   main
+#   ------------------------------------------------------------------------
+
+
+def print_score(name, scored_sentences):
+    """Print `name`, then the text and score of every sentence passed in."""
+    print(name)
+    # iterate the argument, not the script-level `sorted_sentences` global
+    for s in scored_sentences:
+        print(f"{s.text}\n{s.score}\n")
+    print()
+
+
+if __name__ == '__main__':
+    # For every CSV under ../summa/data: rank its sentences with summa's
+    # custom summarizer, print the ranking, and render the graph as SVG.
+    paths = glob.glob('../summa/data/*.csv')
+
+    # collected for the (currently commented-out) averaging step further down
+    matrices = []
+
+    for path in paths:
+        # CSV to matrix
+        matrix = csv_to_matrix(path)
+        matrices.append(matrix)
+        # print(json.dumps(matrix, indent=4))
+
+        # we give summa the whole text to tokenize, from the csv keys
+        text = " ".join(matrix.keys())
+
+        # random walk with pagerank
+        scored_sentences, weighted_graph = custom_summarize(text, matrix)
+
+        # sort the scored sentences, highest score first
+        sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)
+
+        # print them
+        print_score(path, sorted_sentences)
+
+        # create an undirected graphviz graph, laid out by the fdp engine
+        graphviz_graph = Graph(path, engine='fdp', format='svg')
+        graphviz_graph.attr('graph', overlap='false')
+
+        # snapshot the nodes once so both loops index the same sequence
+        nodes = list(weighted_graph.nodes())
+
+        for i, node1 in enumerate(nodes):
+            graphviz_graph.node(str(i), node1)
+
+            # visit each unordered pair once (i < j) so that every undirected
+            # edge is emitted a single time instead of once per direction
+            for j, node2 in enumerate(nodes[i + 1:], start=i + 1):
+                original_weight = weighted_graph.edge_weight((node1, node2))
+
+                if original_weight != weighted_graph.edge_weight((node2, node1)):
+                    print("THIS IS NOT SYMMETRICAL")
+
+                graphviz_graph.edge(str(i), str(j), weight=str(original_weight * 10), penwidth=str(original_weight))
+
+        graphviz_graph.render(path + '_graph')
+
+
+    # # compute the average matrix
+    # matrix_keys = matrices[0].keys()
+    # average_matrix = {}
+    # for i in matrix_keys:
+    #     average_matrix[i] = {}
+    #     for j in matrix_keys:
+    #         average_matrix[i][j] = sum([matrix[i][j] for matrix in matrices]) / len(matrices)
+
+    # # random walk with pagerank
+    # scored_sentences = custom_summarize(text, average_matrix)
+    # # sort the scored sentences
+    # sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)
+
+    # # print them
+    # print_score('average', sorted_sentences)
+