new experimentation on sensible similarity metrics between two sentences
parent
40ea2cae20
commit
e69058624e
@@ -0,0 +1,91 @@
import sys
import json
import glob

from graphviz import Graph

# appending a path so the customised summa fork can be imported
sys.path.append('../')

# importing customised module
import summa.edits
from summa.edits import csv_to_matrix, custom_summarize


# main
# ------------------------------------------------------------------------


def print_score(name, scored_sentences):
    print(name)
    for s in scored_sentences:
        print(s.text)
        print(s.score)
        print()
    print()


if __name__ == '__main__':

    paths = glob.glob('../summa/data/*.csv')

    matrices = []

    for path in paths:

        # CSV to matrix
        matrix = csv_to_matrix(path)
        matrices.append(matrix)
        # print(json.dumps(matrix, indent=4))

        # we give summa the whole text to tokenize, from the csv keys
        text = " ".join(matrix.keys())

        # random walk with pagerank
        scored_sentences, weighted_graph = custom_summarize(text, matrix)

        # sort the scored sentences
        sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)

        # print them
        print_score(path, sorted_sentences)

        # create a graph
        graphviz_graph = Graph(path, engine='fdp', format='svg')
        graphviz_graph.attr('graph', overlap='false')

        for i, node1 in enumerate(weighted_graph.nodes()):
            graphviz_graph.node(str(i), node1)

            for j, node2 in enumerate(weighted_graph.nodes()):

                # the similarity matrix is expected to be symmetrical
                if weighted_graph.edge_weight((node1, node2)) != weighted_graph.edge_weight((node2, node1)):
                    print("THIS IS NOT SYMMETRICAL")

                if i < j:
                    original_weight = weighted_graph.edge_weight((node1, node2))
                    graphviz_graph.edge(str(i), str(j), weight=str(original_weight * 10), penwidth=str(original_weight))

                elif i > j:
                    original_weight = weighted_graph.edge_weight((node2, node1))
                    graphviz_graph.edge(str(j), str(i), weight=str(original_weight * 10), penwidth=str(original_weight))

        graphviz_graph.render(path + '_graph')


    # # compute the average matrix
    # matrix_keys = matrices[0].keys()
    # average_matrix = {}
    # for i in matrix_keys:
    #     average_matrix[i] = {}
    #     for j in matrix_keys:
    #         average_matrix[i][j] = sum([matrix[i][j] for matrix in matrices]) / len(matrices)

    # # random walk with pagerank
    # scored_sentences = custom_summarize(text, average_matrix)
    # # sort the scored sentences
    # sorted_sentences = sorted(scored_sentences, key=lambda s: s.score, reverse=True)

    # # print them
    # print_score('average', sorted_sentences)
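For context, a minimal sketch of how such a sentence-similarity matrix could be built and scored. The Jaccard token-overlap metric and the power-iteration PageRank below are illustrative assumptions only; they are not the metric stored in the CSV files, nor the implementation inside summa.edits.custom_summarize, which this script imports but whose internals are not shown in this diff.

# Sketch under assumptions: a symmetric sentence-similarity matrix as a
# dict-of-dicts keyed by sentence text (the shape csv_to_matrix() appears to
# return), scored with a simple power-iteration PageRank over that matrix.

def jaccard_similarity(a, b):
    """Share of unique tokens the two sentences have in common (assumed metric)."""
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / len(ta | tb) if ta and tb else 0.0

def build_similarity_matrix(sentences):
    """Dict-of-dicts keyed by sentence text, like the CSV-derived matrices."""
    return {s1: {s2: jaccard_similarity(s1, s2) for s2 in sentences}
            for s1 in sentences}

def pagerank_scores(matrix, damping=0.85, iterations=50):
    """Random walk over the weighted similarity graph, starting from a uniform distribution."""
    sentences = list(matrix.keys())
    n = len(sentences)
    scores = {s: 1.0 / n for s in sentences}
    for _ in range(iterations):
        new_scores = {}
        for s in sentences:
            rank = 0.0
            for other in sentences:
                if other == s:
                    continue
                # weight mass the neighbour distributes to all other sentences
                out_weight = sum(w for t, w in matrix[other].items() if t != other)
                if out_weight > 0:
                    rank += scores[other] * matrix[other][s] / out_weight
            new_scores[s] = (1 - damping) / n + damping * rank
        scores = new_scores
    return scores

# Example: three toy sentences, printed highest-scoring first.
sents = ["the cat sat on the mat", "a cat on a mat", "dogs chase cats"]
for sent, score in sorted(pagerank_scores(build_similarity_matrix(sents)).items(),
                          key=lambda kv: -kv[1]):
    print(round(score, 3), sent)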