new experimentation on sensible similarity metrics between 2 sentences
parent
40ea2cae20
commit
e69058624e
@ -0,0 +1,91 @@
|
||||
import sys
|
||||
import json
|
||||
import glob
|
||||
from graphviz import Graph
|
||||
|
||||
# appending a path
|
||||
sys.path.append('../')
|
||||
|
||||
# importing customised module
|
||||
import summa.edits
|
||||
from summa.edits import csv_to_matrix, custom_summarize
|
||||
|
||||
|
||||
# main
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
|
||||
def print_score(name, scored_sentences):
    """Print a heading, then the text and score of every given sentence.

    Args:
        name: Heading printed on the first line (the caller passes the
            path of the CSV file being processed).
        scored_sentences: Iterable of objects exposing ``text`` and
            ``score`` attributes. They are printed in the order given;
            no sorting happens here.
    """
    print(name)
    # Iterate the argument itself, not a module-level variable, so the
    # function prints exactly what the caller handed in.
    for sentence in scored_sentences:
        print(sentence.text)
        print(sentence.score)
        print()
    print()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Process every CSV file found in the summa data directory.
    paths = glob.glob('../summa/data/*.csv')

    # Collected per-file matrices; only appended to in this loop (the
    # commented-out averaging experiment further down is what reads them).
    matrices = []

    for path in paths:
        # CSV to matrix
        matrix = csv_to_matrix(path)
        matrices.append(matrix)
        # print(json.dumps(matrix, indent=4))

        # We give summa the whole text to tokenize, rebuilt from the csv keys
        # (presumably one sentence per key -- confirm against csv_to_matrix).
        text = " ".join(matrix.keys())

        # Random walk with pagerank.
        scored_sentences, weighted_graph = custom_summarize(text, matrix)

        # Highest score first.
        sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)
        print_score(path, sorted_sentences)

        # Render the weighted sentence graph as an SVG next to the CSV.
        graphviz_graph = Graph(path, engine='fdp', format='svg')
        graphviz_graph.attr('graph', overlap='false')

        # Snapshot the nodes once so indices stay stable and nodes() is not
        # re-evaluated for every row of the pair loop.
        nodes = list(weighted_graph.nodes())

        for i, node1 in enumerate(nodes):
            graphviz_graph.node(str(i), node1)

            # Visit each unordered pair exactly once (j > i). Walking every
            # ordered pair would add the same undirected edge twice.
            for j, node2 in enumerate(nodes[i + 1:], start=i + 1):
                original_weight = weighted_graph.edge_weight((node1, node2))

                # Sanity check: an undirected drawing only makes sense if the
                # weight is the same in both directions.
                if original_weight != weighted_graph.edge_weight((node2, node1)):
                    print("THIS IS NOT SYMMETRICAL")

                graphviz_graph.edge(
                    str(i),
                    str(j),
                    weight=str(original_weight * 10),
                    penwidth=str(original_weight),
                )

        graphviz_graph.render(path + '_graph')
|
||||
|
||||
|
||||
# # compute the average matrix
|
||||
# matrix_keys = matrices[0].keys()
|
||||
# average_matrix = {}
|
||||
# for i in matrix_keys:
|
||||
# average_matrix[i] = {}
|
||||
# for j in matrix_keys:
|
||||
# average_matrix[i][j] = sum([matrix[i][j] for matrix in matrices]) / len(matrices)
|
||||
|
||||
# # random walk with pagerank
|
||||
# scored_sentences = custom_summarize(text, average_matrix)
|
||||
# # sort the scored sentences
|
||||
# sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)
|
||||
|
||||
# # print them
|
||||
# print_score('average', sorted_sentences)
|
||||
|
Loading…
Reference in New Issue