From e69058624e1038015b04c97090ea63bcaa56abdb Mon Sep 17 00:00:00 2001
From: Dorian
Date: Fri, 16 Dec 2022 18:01:14 +0100
Subject: [PATCH] new experimentation on sensible similarity metrics between 2 sentences

---
Review note: the payload below was amended after export (print_score now
reads its own argument, each graph edge is added once, documentation was
added), so the blob id on the "index" line predates these changes.

 exp.subjective-similarity-metrics/main.py | 102 ++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 exp.subjective-similarity-metrics/main.py

diff --git a/exp.subjective-similarity-metrics/main.py b/exp.subjective-similarity-metrics/main.py
new file mode 100644
index 0000000..638ed8d
--- /dev/null
+++ b/exp.subjective-similarity-metrics/main.py
@@ -0,0 +1,102 @@
+import sys
+import json
+import glob
+from graphviz import Graph
+
+# appending a path
+sys.path.append('../')
+
+# importing customised module
+import summa.edits
+from summa.edits import csv_to_matrix, custom_summarize
+
+
+# main
+# ------------------------------------------------------------------------
+
+
+def print_score(name, scored_sentences):
+    """Print a heading, then the text and score of every sentence.
+
+    Args:
+        name: Heading printed first (the script passes the CSV path).
+        scored_sentences: Iterable of objects exposing ``text`` and
+            ``score``; they are printed in the order given.
+    """
+    print(name)
+    # Walk the argument itself: the original looped over the module-level
+    # `sorted_sentences`, which only exists when this file runs as a script.
+    for s in scored_sentences:
+        print(s.text)
+        print(s.score)
+        print()
+    print()
+
+
+if __name__ == '__main__':
+
+    # every CSV file under ../summa/data is processed independently
+    paths = glob.glob('../summa/data/*.csv')
+
+    # collected for the average-matrix experiment that is commented out below
+    matrices = []
+
+    for path in paths:
+
+        # CSV to matrix
+        matrix = csv_to_matrix(path)
+        matrices.append(matrix)
+        # print(json.dumps(matrix, indent=4))
+
+        # we give summa the whole text to tokenize, from the csv keys
+        text = " ".join(matrix.keys())
+
+        # random walk with pagerank
+        scored_sentences, weighted_graph = custom_summarize(text, matrix)
+
+        # sort the scored sentences, highest score first
+        sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)
+
+        # print them
+        print_score(path, sorted_sentences)
+
+        # create a graph
+        graphviz_graph = Graph(path, engine='fdp', format='svg')
+        graphviz_graph.attr('graph', overlap='false')
+
+        for i, node1 in enumerate(weighted_graph.nodes()):
+            graphviz_graph.node(str(i), node1)
+
+            for j, node2 in enumerate(weighted_graph.nodes()):
+
+                # warn when the two directions of an edge carry different weights
+                if weighted_graph.edge_weight((node1, node2)) != weighted_graph.edge_weight((node2, node1)):
+                    print("THIS IS NOT SYMMETRICAL")
+
+                # Add each undirected edge once. The former `elif i > j`
+                # branch re-added the very same edge with the same weight,
+                # so every pair of nodes was drawn twice.
+                if i < j:
+                    original_weight = weighted_graph.edge_weight((node1, node2))
+                    graphviz_graph.edge(str(i), str(j), weight=str(original_weight * 10), penwidth=str(original_weight))
+
+
+        graphviz_graph.render(path + '_graph')
+
+
+    # # compute the average matrix
+    # matrix_keys = matrices[0].keys()
+    # average_matrix = {}
+    # for i in matrix_keys:
+    #     average_matrix[i] = {}
+    #     for j in matrix_keys:
+    #         average_matrix[i][j] = sum([matrix[i][j] for matrix in matrices]) / len(matrices)
+
+    # # random walk with pagerank
+    # scored_sentences = custom_summarize(text, average_matrix)
+    # # sort the scored sentences
+    # sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)
+
+    # # print them
+    # print_score('average', sorted_sentences)
+