You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

316 lines
11 KiB
Python

#!/usr/bin/env/ python
# This script unfolds the Levenhstein Distance algorithm,
# often used in spellcheckers and to detect similarity between texts.
# The algorithm reads a fragment on a eucalyptus tree from
# Julio Cortazar's 'Historias de Cronopios y Famas', Alfaguara, 2012.
# Copyright (C) 2021, Anais Berck
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details: <http://www.gnu.org/licenses/>.
# Python libraries
import random
import math
import random
import textwrap
import Levenshtein
import sys
# Functions
# Open & prepare textfile for machinal reading
def openfile(txt):
all_text = []
with open(txt, 'r') as source:
# Read the text
text = source.read()
# Remove punctuation
characters = ['\n', '-', ':', '.', ',']
for c in characters:
if c in text:
clean_text = text.replace(c,' ')
# Transform fragment in a list of words
words = clean_text.split()
# Recolt all unique words of the fragment in a set of words
for word in words:
word = word.lower()
word = word.strip()
all_text.append(word)
return all_text
# Levenhstein Distance
def levenshtein(a, b):
if not a: return len(b)
if not b: return len(a)
return min(levenshtein(a[1:], b[1:])+(a[0] != b[0]),
levenshtein(a[1:], b)+1,
levenshtein(a, b[1:])+1)
# Create map
def space_available (grid, start, end, line):
other_words = grid[line]
for other_start, other_end, _ in other_words:
if start < other_end and end > other_start:
return False
return True
# Create frame
def formatTable(words, cellWidth, cellHeight, padding=1, cells=6):
def makeRow (height):
return [''] * cellHeight
def makeLine (width, text = '', fillChar=' ', padding=1):
text = (padding * fillChar) + text
return text.center(width - padding, ' ') + (padding * fillChar)
out = ""
row = makeRow(cellHeight)
cell = 0
for word in words:
wrapped = textwrap.wrap(word, width=cellWidth - (2 * padding))
lines = padding * [makeLine(cellWidth, padding=padding)]
for line in wrapped:
lines.append(makeLine(cellWidth, text=line))
for _ in range(cellHeight - len(lines)):
lines.append(makeLine(cellWidth, padding=padding))
cell += 1
for (i, line) in enumerate(lines):
row[i] += line
if cell == cells:
out += '\n'.join(row)
row = makeRow(cellHeight)
cell = 0
if cell > 0:
out += '\n'.join(row)
row = makeRow(cellHeight)
return out
# print statements for interaction with reader
def pick_new_main_tree_terminal(fragment, all_simple, file_out=sys.stdout):
print(fragment, file=file_out)
## ask input reader
print("\nPara que La Distancia de Levenshtein pueda leer el texto y producir un libro único, necesitas cambiar una palabra.", file=file_out)
print("\nPuedes elegir otra especie de árbol para el eucalipto, el árbol principal del fragmento. La Distancia de Levenshtein calculará entonces qué especie se encuentra en su cercanía y se reemplazará en el fragmento la palabra más genérica 'árboles' por esta nueva especie.", file=file_out)
print("\nPor favor, elige el árbol principal de la lista siguiente:", file=file_out)
print(', '.join(all_simple), file=file_out)
print("\nQuiero reemplazar el eucalipto por:", file=file_out)
new_main_tree = input()
while new_main_tree not in all_simple:
print("Tal vez escribiste mal la especie, por favor intente de nuevo.", file=file_out)
new_main_tree = input()
return new_main_tree
# Levenshtein Distance between new main tree and other tree species
def calculate_distances (center_tree, other_trees):
# Declare dictionary object
distances = {}
# For every species in the list, compare the main tree to the selected species and count the similarity between both
for tree in other_trees:
# using the Levenhstein Distance algorithm
# count = levenshtein(new_main_tree,tree)
count = Levenshtein.distance(center_tree, tree)
# save each compared species and its similarity count the dictionary
distances[tree] = count
return distances
def sort_distances (distances):
return sorted(distances.items(), key = lambda kv: kv[1])
# Find the minimum distance between new main tree and near species
def find_nearest_species(sorted_distances, trees):
# First entry in sorted_distances is the new_main_tree, distance 0
# pick the next value
minimum_distance = sorted_distances[1][1]
possible_trees = []
for tree in sorted_distances[1:]:
if tree[1] == minimum_distance:
possible_trees.append(tree)
else:
break
near_species = random.choice(possible_trees)[0]
nearest_species = trees[near_species]
return (near_species, nearest_species)
# rewrite fragment Cortazar
def generate_new_fragment(fragment, new_main_tree, nearest_species):
new_fragment = fragment.replace("eucalipto", new_main_tree)
new_fragment = new_fragment.replace("árboles", nearest_species)
new_fragment = new_fragment.replace("Un fama anda", "Un fama ignorante anda")
return new_fragment
# generate in between species and show the process of the Levenhstein Distance algorithm
def generate_in_between_species (new_main_tree, near_species, file_out=sys.stdout):
# Define length of main character tree and major species
length_main_character_tree = len(new_main_tree)
length_near_species = len(near_species)
# Declare a list of in between words showing the process
in_between_species = []
# Loop over every number until reaching the total lenght of the original word
for cell in range(length_main_character_tree):
#print('row number: ', element)
# Loop over every number until reaching the total lenght of the replacement word
for number in range(length_near_species):
#print('column number: ', number)
# select the number of letters +1 of the original word
part1 = new_main_tree[:cell+1]
print('La Distancia de Levenshtein observa una parte del',new_main_tree, ':', part1, file=file_out)
# select the number of letters +1 of the replacement word
part2 = near_species[:number+1]
print('Después observa una parte del', near_species, ':', part2, file=file_out)
# selected letters of the original words replace the selected letters of the replacement word
new_species = part1 + near_species[number+1:]
print('En su intento de comparación reemplaza la parte del', near_species, 'por la parte del', new_main_tree, 'y crea así una nueva especie intermediaria: el', new_species, file=file_out)
# add in between words to the list
in_between_species.append(new_species)
# calculate the similarity between in between words
print('Calcula las acciones necesarias para reemplazar', new_main_tree[:cell+1], 'por', near_species[:number+1], ': ', Levenshtein.distance(new_main_tree[:cell+1], near_species[:number+1]), '\n', file=file_out)
# print('\n', file=file_out)
## Print all in between words
#print('in between species: ', in_between_species, file=file_out)
return in_between_species
# Draw a map of all in between species and their distance to main tree
def print_map(sorted_distances, show_distances=True, file_out=sys.stdout):
# As characters are less wide than high make them a bit wider
xscale = 2
# Height of the map
height = 70
# Width of the map
width = int(70 * xscale)
# Centerpoint of the map
middle = (int(30 * xscale), 35)
grid = [[] for x in range(height + 1)]
centerpoint = sorted_distances[0][0]
start = middle[0] - max(0, int(len(centerpoint) / 2))
grid[middle[1]].append((start, start + len(centerpoint), centerpoint))
for treename, distance in sorted_distances[1:]:
placed = False
treename = ' {}({}) '.format(treename, distance) if show_distances else ' {} '.format(treename)
angle = random.random() * math.pi * 2
step = 0
steps = 180
while (not placed) and step < steps:
x = int(math.floor(math.cos(angle) * (distance * 2.8)) * xscale + middle[0])
y = int(math.floor(math.sin(angle) * (distance * 2.8)) + middle[1])
start = x - max(0, int(len(treename) / 2))
end = start + len(treename)
if space_available(grid, start, end, y):
grid[y].append((start, end, treename))
placed = True
angle += (math.pi * 2 / steps)
step += 1
if not placed:
print('Could not place {}'.format(treename), file=file_out)
# print(angle, x, y)
for row in grid:
# Sort by first key of the tuples, start of the word
row = sorted(row, key=lambda r: r[0])
if len(row):
line = ''
for start, _, treename in row:
line += ((start) - len(line)) * ' ' + treename
print(line, file=file_out)
# draw table with all new intermediary species
def print_table(new_main_tree, near_species, in_between_species, file_out=sys.stdout):
## 8. Print tabel
print('{}{}'.format(new_main_tree, near_species), file=file_out)
print('', file=file_out)
print(formatTable(in_between_species, 20, 5, cells=len(near_species)), file=file_out)
# Execute functions
if __name__ == '__main__':
## 1A. Open textfiles & turn textfiles in machine readable lists
# Cortazar
txt = 'eucalipto_cortazar.txt'
# Open textfile
all_text = openfile(txt)
# List of trees
txt1 = 'arboles_simple.txt'
txt2 = 'arboles_plural.txt'
all_simple = openfile(txt1)
all_plural = openfile(txt2)
## 1B. Turn list of trees into dictionary of single/plural words
trees = dict(zip(all_simple, all_plural))
## 2. HOMEPAGE print statements
## print fragment Cortazar
with open(txt, 'r') as source:
fragment = source.read()
## 2b. Ask user to pick a new main tree
new_main_tree = pick_new_main_tree_terminal(fragment, all_simple, sys.stdout)
## 3. Count the similarity between the main tree and the other species in the forest
distances = calculate_distances(new_main_tree, all_simple)
## 4. Sort the distances between the trees and the new main tree from the lowest to the highest counts
sorted_distances = sort_distances(distances)
# Find the nearest species
near_species, nearest_species = find_nearest_species(sorted_distances, trees)
# Print rewritten fragment
print("\n")
new_fragment = generate_new_fragment(fragment, new_main_tree, nearest_species)
print(new_fragment, file=sys.stdout)
## 6. Compare the similarity between the main character tree and the main species in the forest,
in_between_species = generate_in_between_species(new_main_tree, near_species, file_out=sys.stdout)
# Show the sorted distances dictionary
# print('Sorted distances: ', sorted_distances, file=sys.stdout)
# print('\n', file=sys.stdout)
# generate_new_text(fragment, new_main_tree, all_simple, all_plural, trees, sys.stdout)
## 7. Generate map of the woods
print_map(sorted_distances, file_out=sys.stdout)
## 8. Generate intermediary species table
print_table(new_main_tree, near_species, in_between_species, sys.stdout)