new experimentation with sensible similarity metrics between two sentences

master
Dorian 2 years ago
parent 40ea2cae20
commit e69058624e

@@ -0,0 +1,91 @@
import sys
import json
import glob
from graphviz import Graph
# appending a path
sys.path.append('../')
# importing customised module
import summa.edits
from summa.edits import csv_to_matrix, custom_summarize
# main
# ------------------------------------------------------------------------
def print_score(name, scored_sentences):
    print(name)
    for s in scored_sentences:
        print(s.text)
        print(s.score)
        print()
    print()
if __name__ == '__main__':
    paths = glob.glob('../summa/data/*.csv')
    matrices = []
    for path in paths:
        # csv to matrix
        matrix = csv_to_matrix(path)
        matrices.append(matrix)
        # print(json.dumps(matrix, indent=4))
        # we give summa the whole text to tokenize, from the csv keys
        text = " ".join(matrix.keys())
        # random walk with pagerank
        scored_sentences, weighted_graph = custom_summarize(text, matrix)
        # sort the scored sentences
        sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)
        # print them
        print_score(path, sorted_sentences)
        # create a graph
        graphviz_graph = Graph(path, engine='fdp', format='svg')
        graphviz_graph.attr('graph', overlap='false')
        for i, node1 in enumerate(weighted_graph.nodes()):
            graphviz_graph.node(str(i), node1)
            for j, node2 in enumerate(weighted_graph.nodes()):
                # sanity check: the similarity weights should be symmetrical
                if weighted_graph.edge_weight((node1, node2)) != weighted_graph.edge_weight((node2, node1)):
                    print("THIS IS NOT SYMMETRICAL")
                if i < j:
                    original_weight = weighted_graph.edge_weight((node1, node2))
                    graphviz_graph.edge(str(i), str(j), weight=str(original_weight * 10), penwidth=str(original_weight))
                elif i > j:
                    original_weight = weighted_graph.edge_weight((node2, node1))
                    graphviz_graph.edge(str(j), str(i), weight=str(original_weight * 10), penwidth=str(original_weight))
        graphviz_graph.render(path + '_graph')
    # # compute the average matrix
    # matrix_keys = matrices[0].keys()
    # average_matrix = {}
    # for i in matrix_keys:
    #     average_matrix[i] = {}
    #     for j in matrix_keys:
    #         average_matrix[i][j] = sum([matrix[i][j] for matrix in matrices]) / len(matrices)
    # # random walk with pagerank
    # scored_sentences = custom_summarize(text, average_matrix)
    # # sort the scored sentences
    # sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)
    # # print them
    # print_score('average', sorted_sentences)
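
As a point of comparison for the "sensible similarity metrics" named in the commit message: the classic TextRank sentence similarity (the formula from Mihalcea & Tarau's paper, which summarizers in the summa family build on) is simply word overlap normalized by the log of the two sentence lengths. The sketch below is only that baseline, written independently of summa.edits and of the CSV matrices, whose contents are not shown in this commit.

import math

def overlap_similarity(s1, s2):
    # classic TextRank similarity: shared words over log-length normalization
    w1, w2 = set(s1.lower().split()), set(s2.lower().split())
    if len(w1) < 2 or len(w2) < 2:
        return 0.0
    return len(w1 & w2) / (math.log(len(w1)) + math.log(len(w2)))

# e.g. overlap_similarity("the cat sat on the mat", "a cat lay on the mat")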