You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
316 lines
11 KiB
Python
316 lines
11 KiB
Python
#!/usr/bin/env/ python
|
|
|
|
# This script unfolds the Levenhstein Distance algorithm,
|
|
# often used in spellcheckers and to detect similarity between texts.
|
|
# The algorithm reads a fragment on a eucalyptus tree from
|
|
# Julio Cortazar's 'Historias de Cronopios y Famas', Alfaguara, 2012.
|
|
|
|
# Copyright (C) 2021, Anais Berck
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details: <http://www.gnu.org/licenses/>.
|
|
|
|
# Python libraries
|
|
import random
|
|
import math
|
|
import random
|
|
import textwrap
|
|
import Levenshtein
|
|
import sys
|
|
|
|
# Functions
|
|
|
|
# Open & prepare textfile for machinal reading
|
|
def openfile(txt):
|
|
all_text = []
|
|
with open(txt, 'r') as source:
|
|
# Read the text
|
|
text = source.read()
|
|
# Remove punctuation
|
|
characters = ['\n', '-', ':', '.', ',']
|
|
for c in characters:
|
|
if c in text:
|
|
clean_text = text.replace(c,' ')
|
|
# Transform fragment in a list of words
|
|
words = clean_text.split()
|
|
# Recolt all unique words of the fragment in a set of words
|
|
for word in words:
|
|
word = word.lower()
|
|
word = word.strip()
|
|
all_text.append(word)
|
|
return all_text
|
|
|
|
# Levenhstein Distance
|
|
def levenshtein(a, b):
|
|
if not a: return len(b)
|
|
if not b: return len(a)
|
|
return min(levenshtein(a[1:], b[1:])+(a[0] != b[0]),
|
|
levenshtein(a[1:], b)+1,
|
|
levenshtein(a, b[1:])+1)
|
|
|
|
# Create map
|
|
def space_available (grid, start, end, line):
|
|
other_words = grid[line]
|
|
|
|
for other_start, other_end, _ in other_words:
|
|
if start < other_end and end > other_start:
|
|
return False
|
|
return True
|
|
|
|
# Create frame
|
|
def formatTable(words, cellWidth, cellHeight, padding=1, cells=6):
|
|
def makeRow (height):
|
|
return [''] * cellHeight
|
|
|
|
def makeLine (width, text = '', fillChar=' ', padding=1):
|
|
text = (padding * fillChar) + text
|
|
return text.center(width - padding, ' ') + (padding * fillChar)
|
|
|
|
out = ""
|
|
|
|
row = makeRow(cellHeight)
|
|
|
|
cell = 0
|
|
|
|
for word in words:
|
|
wrapped = textwrap.wrap(word, width=cellWidth - (2 * padding))
|
|
lines = padding * [makeLine(cellWidth, padding=padding)]
|
|
|
|
for line in wrapped:
|
|
lines.append(makeLine(cellWidth, text=line))
|
|
|
|
for _ in range(cellHeight - len(lines)):
|
|
lines.append(makeLine(cellWidth, padding=padding))
|
|
|
|
cell += 1
|
|
|
|
for (i, line) in enumerate(lines):
|
|
row[i] += line
|
|
|
|
if cell == cells:
|
|
out += '\n'.join(row)
|
|
row = makeRow(cellHeight)
|
|
cell = 0
|
|
|
|
if cell > 0:
|
|
out += '\n'.join(row)
|
|
row = makeRow(cellHeight)
|
|
|
|
return out
|
|
|
|
# print statements for interaction with reader
|
|
def pick_new_main_tree_terminal(fragment, all_simple, file_out=sys.stdout):
|
|
print(fragment, file=file_out)
|
|
## ask input reader
|
|
print("\nPara que La Distancia de Levenshtein pueda leer el texto y producir un libro único, necesitas cambiar una palabra.", file=file_out)
|
|
print("\nPuedes elegir otra especie de árbol para el eucalipto, el árbol principal del fragmento. La Distancia de Levenshtein calculará entonces qué especie se encuentra en su cercanía y se reemplazará en el fragmento la palabra más genérica 'árboles' por esta nueva especie.", file=file_out)
|
|
print("\nPor favor, elige el árbol principal de la lista siguiente:", file=file_out)
|
|
print(', '.join(all_simple), file=file_out)
|
|
print("\nQuiero reemplazar el eucalipto por:", file=file_out)
|
|
new_main_tree = input()
|
|
while new_main_tree not in all_simple:
|
|
print("Tal vez escribiste mal la especie, por favor intente de nuevo.", file=file_out)
|
|
new_main_tree = input()
|
|
return new_main_tree
|
|
|
|
# Levenshtein Distance between new main tree and other tree species
|
|
def calculate_distances (center_tree, other_trees):
|
|
# Declare dictionary object
|
|
distances = {}
|
|
# For every species in the list, compare the main tree to the selected species and count the similarity between both
|
|
for tree in other_trees:
|
|
# using the Levenhstein Distance algorithm
|
|
# count = levenshtein(new_main_tree,tree)
|
|
count = Levenshtein.distance(center_tree, tree)
|
|
# save each compared species and its similarity count the dictionary
|
|
distances[tree] = count
|
|
|
|
return distances
|
|
|
|
|
|
def sort_distances (distances):
|
|
return sorted(distances.items(), key = lambda kv: kv[1])
|
|
|
|
|
|
# Find the minimum distance between new main tree and near species
|
|
def find_nearest_species(sorted_distances, trees):
|
|
# First entry in sorted_distances is the new_main_tree, distance 0
|
|
# pick the next value
|
|
minimum_distance = sorted_distances[1][1]
|
|
possible_trees = []
|
|
|
|
for tree in sorted_distances[1:]:
|
|
if tree[1] == minimum_distance:
|
|
possible_trees.append(tree)
|
|
else:
|
|
break
|
|
|
|
near_species = random.choice(possible_trees)[0]
|
|
nearest_species = trees[near_species]
|
|
|
|
return (near_species, nearest_species)
|
|
|
|
# rewrite fragment Cortazar
|
|
def generate_new_fragment(fragment, new_main_tree, nearest_species):
|
|
new_fragment = fragment.replace("eucalipto", new_main_tree)
|
|
new_fragment = new_fragment.replace("árboles", nearest_species)
|
|
new_fragment = new_fragment.replace("Un fama anda", "Un fama ignorante anda")
|
|
|
|
return new_fragment
|
|
|
|
# generate in between species and show the process of the Levenhstein Distance algorithm
|
|
def generate_in_between_species (new_main_tree, near_species, file_out=sys.stdout):
|
|
# Define length of main character tree and major species
|
|
length_main_character_tree = len(new_main_tree)
|
|
length_near_species = len(near_species)
|
|
|
|
# Declare a list of in between words showing the process
|
|
in_between_species = []
|
|
# Loop over every number until reaching the total lenght of the original word
|
|
for cell in range(length_main_character_tree):
|
|
#print('row number: ', element)
|
|
# Loop over every number until reaching the total lenght of the replacement word
|
|
for number in range(length_near_species):
|
|
#print('column number: ', number)
|
|
# select the number of letters +1 of the original word
|
|
part1 = new_main_tree[:cell+1]
|
|
print('La Distancia de Levenshtein observa una parte del',new_main_tree, ':', part1, file=file_out)
|
|
# select the number of letters +1 of the replacement word
|
|
part2 = near_species[:number+1]
|
|
print('Después observa una parte del', near_species, ':', part2, file=file_out)
|
|
# selected letters of the original words replace the selected letters of the replacement word
|
|
new_species = part1 + near_species[number+1:]
|
|
print('En su intento de comparación reemplaza la parte del', near_species, 'por la parte del', new_main_tree, 'y crea así una nueva especie intermediaria: el', new_species, file=file_out)
|
|
# add in between words to the list
|
|
in_between_species.append(new_species)
|
|
# calculate the similarity between in between words
|
|
print('Calcula las acciones necesarias para reemplazar', new_main_tree[:cell+1], 'por', near_species[:number+1], ': ', Levenshtein.distance(new_main_tree[:cell+1], near_species[:number+1]), '\n', file=file_out)
|
|
# print('\n', file=file_out)
|
|
|
|
## Print all in between words
|
|
#print('in between species: ', in_between_species, file=file_out)
|
|
|
|
return in_between_species
|
|
|
|
# Draw a map of all in between species and their distance to main tree
|
|
def print_map(sorted_distances, show_distances=True, file_out=sys.stdout):
|
|
# As characters are less wide than high make them a bit wider
|
|
xscale = 2
|
|
# Height of the map
|
|
height = 70
|
|
# Width of the map
|
|
width = int(70 * xscale)
|
|
# Centerpoint of the map
|
|
middle = (int(30 * xscale), 35)
|
|
|
|
grid = [[] for x in range(height + 1)]
|
|
|
|
centerpoint = sorted_distances[0][0]
|
|
start = middle[0] - max(0, int(len(centerpoint) / 2))
|
|
|
|
grid[middle[1]].append((start, start + len(centerpoint), centerpoint))
|
|
|
|
for treename, distance in sorted_distances[1:]:
|
|
placed = False
|
|
|
|
treename = ' {}({}) '.format(treename, distance) if show_distances else ' {} '.format(treename)
|
|
angle = random.random() * math.pi * 2
|
|
step = 0
|
|
steps = 180
|
|
while (not placed) and step < steps:
|
|
|
|
x = int(math.floor(math.cos(angle) * (distance * 2.8)) * xscale + middle[0])
|
|
y = int(math.floor(math.sin(angle) * (distance * 2.8)) + middle[1])
|
|
|
|
start = x - max(0, int(len(treename) / 2))
|
|
end = start + len(treename)
|
|
|
|
if space_available(grid, start, end, y):
|
|
grid[y].append((start, end, treename))
|
|
placed = True
|
|
|
|
angle += (math.pi * 2 / steps)
|
|
step += 1
|
|
|
|
if not placed:
|
|
print('Could not place {}'.format(treename), file=file_out)
|
|
# print(angle, x, y)
|
|
|
|
for row in grid:
|
|
# Sort by first key of the tuples, start of the word
|
|
row = sorted(row, key=lambda r: r[0])
|
|
|
|
if len(row):
|
|
line = ''
|
|
for start, _, treename in row:
|
|
line += ((start) - len(line)) * ' ' + treename
|
|
|
|
print(line, file=file_out)
|
|
|
|
# draw table with all new intermediary species
|
|
def print_table(new_main_tree, near_species, in_between_species, file_out=sys.stdout):
|
|
## 8. Print tabel
|
|
print('{} → {}'.format(new_main_tree, near_species), file=file_out)
|
|
print('', file=file_out)
|
|
print(formatTable(in_between_species, 20, 5, cells=len(near_species)), file=file_out)
|
|
|
|
# Execute functions
|
|
if __name__ == '__main__':
|
|
## 1A. Open textfiles & turn textfiles in machine readable lists
|
|
|
|
# Cortazar
|
|
txt = 'eucalipto_cortazar.txt'
|
|
# Open textfile
|
|
all_text = openfile(txt)
|
|
|
|
# List of trees
|
|
txt1 = 'arboles_simple.txt'
|
|
txt2 = 'arboles_plural.txt'
|
|
all_simple = openfile(txt1)
|
|
all_plural = openfile(txt2)
|
|
|
|
## 1B. Turn list of trees into dictionary of single/plural words
|
|
trees = dict(zip(all_simple, all_plural))
|
|
|
|
## 2. HOMEPAGE print statements
|
|
## print fragment Cortazar
|
|
with open(txt, 'r') as source:
|
|
fragment = source.read()
|
|
|
|
## 2b. Ask user to pick a new main tree
|
|
new_main_tree = pick_new_main_tree_terminal(fragment, all_simple, sys.stdout)
|
|
|
|
## 3. Count the similarity between the main tree and the other species in the forest
|
|
distances = calculate_distances(new_main_tree, all_simple)
|
|
|
|
## 4. Sort the distances between the trees and the new main tree from the lowest to the highest counts
|
|
sorted_distances = sort_distances(distances)
|
|
|
|
# Find the nearest species
|
|
near_species, nearest_species = find_nearest_species(sorted_distances, trees)
|
|
|
|
# Print rewritten fragment
|
|
print("\n")
|
|
new_fragment = generate_new_fragment(fragment, new_main_tree, nearest_species)
|
|
print(new_fragment, file=sys.stdout)
|
|
|
|
## 6. Compare the similarity between the main character tree and the main species in the forest,
|
|
in_between_species = generate_in_between_species(new_main_tree, near_species, file_out=sys.stdout)
|
|
|
|
# Show the sorted distances dictionary
|
|
# print('Sorted distances: ', sorted_distances, file=sys.stdout)
|
|
# print('\n', file=sys.stdout)
|
|
|
|
# generate_new_text(fragment, new_main_tree, all_simple, all_plural, trees, sys.stdout)
|
|
## 7. Generate map of the woods
|
|
print_map(sorted_distances, file_out=sys.stdout)
|
|
|
|
## 8. Generate intermediary species table
|
|
print_table(new_main_tree, near_species, in_between_species, sys.stdout)
|