#!/usr/bin/env/ python # This script unfolds the Levenhstein Distance algorithm, # often used in spellcheckers and to detect similarity between texts. # The algorithm reads a fragment on a eucalyptus tree from # Julio Cortazar's 'Historias de Cronopios y Famas', Alfaguara, 2012. # Copyright (C) 2021, Anais Berck # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details: . # Python libraries import random import math import random import textwrap import Levenshtein import sys # Functions # Open & prepare textfile for machinal reading def openfile(txt): all_text = [] with open(txt, 'r') as source: # Read the text text = source.read() # Remove punctuation characters = ['\n', '-', ':', '.', ','] for c in characters: if c in text: clean_text = text.replace(c,' ') # Transform fragment in a list of words words = clean_text.split() # Recolt all unique words of the fragment in a set of words for word in words: word = word.lower() word = word.strip() all_text.append(word) return all_text # Levenhstein Distance def levenshtein(a, b): if not a: return len(b) if not b: return len(a) return min(levenshtein(a[1:], b[1:])+(a[0] != b[0]), levenshtein(a[1:], b)+1, levenshtein(a, b[1:])+1) # Create map def space_available (grid, start, end, line): other_words = grid[line] for other_start, other_end, _ in other_words: if start < other_end and end > other_start: return False return True # Create frame def formatTable(words, cellWidth, cellHeight, padding=1, cells=6): def makeRow (height): return [''] * cellHeight def makeLine (width, text = '', fillChar=' ', padding=1): text = (padding * fillChar) + text return text.center(width - padding, ' ') + (padding * fillChar) out = "" row = makeRow(cellHeight) cell = 0 for word in words: wrapped = textwrap.wrap(word, width=cellWidth - (2 * padding)) lines = padding * [makeLine(cellWidth, padding=padding)] for line in wrapped: lines.append(makeLine(cellWidth, text=line)) for _ in range(cellHeight - len(lines)): lines.append(makeLine(cellWidth, padding=padding)) cell += 1 for (i, line) in enumerate(lines): row[i] += line if cell == cells: out += '\n'.join(row) row = makeRow(cellHeight) cell = 0 if cell > 0: out += '\n'.join(row) row = makeRow(cellHeight) return out # print statements for interaction with reader def pick_new_main_tree_terminal(fragment, all_simple, file_out=sys.stdout): print(fragment, file=file_out) ## ask input reader print("\nPara que La Distancia de Levenshtein pueda leer el texto y producir un libro único, necesitas cambiar una palabra.", file=file_out) print("\nPuedes elegir otra especie de árbol para el eucalipto, el árbol principal del fragmento. La Distancia de Levenshtein calculará entonces qué especie se encuentra en su cercanía y se reemplazará en el fragmento la palabra más genérica 'árboles' por esta nueva especie.", file=file_out) print("\nPor favor, elige el árbol principal de la lista siguiente:", file=file_out) print(', '.join(all_simple), file=file_out) print("\nQuiero reemplazar el eucalipto por:", file=file_out) new_main_tree = input() while new_main_tree not in all_simple: print("Tal vez escribiste mal la especie, por favor intente de nuevo.", file=file_out) new_main_tree = input() return new_main_tree # Levenshtein Distance between new main tree and other tree species def calculate_distances (center_tree, other_trees): # Declare dictionary object distances = {} # For every species in the list, compare the main tree to the selected species and count the similarity between both for tree in other_trees: # using the Levenhstein Distance algorithm # count = levenshtein(new_main_tree,tree) count = Levenshtein.distance(center_tree, tree) # save each compared species and its similarity count the dictionary distances[tree] = count return distances def sort_distances (distances): return sorted(distances.items(), key = lambda kv: kv[1]) # Find the minimum distance between new main tree and near species def find_nearest_species(sorted_distances, trees): # First entry in sorted_distances is the new_main_tree, distance 0 # pick the next value minimum_distance = sorted_distances[1][1] possible_trees = [] for tree in sorted_distances[1:]: if tree[1] == minimum_distance: possible_trees.append(tree) else: break near_species = random.choice(possible_trees)[0] nearest_species = trees[near_species] return (near_species, nearest_species) # rewrite fragment Cortazar def generate_new_fragment(fragment, new_main_tree, nearest_species): new_fragment = fragment.replace("eucalipto", new_main_tree) new_fragment = new_fragment.replace("árboles", nearest_species) new_fragment = new_fragment.replace("Un fama anda", "Un fama ignorante anda") return new_fragment # generate in between species and show the process of the Levenhstein Distance algorithm def generate_in_between_species (new_main_tree, near_species, file_out=sys.stdout): # Define length of main character tree and major species length_main_character_tree = len(new_main_tree) length_near_species = len(near_species) # Declare a list of in between words showing the process in_between_species = [] # Loop over every number until reaching the total lenght of the original word for cell in range(length_main_character_tree): #print('row number: ', element) # Loop over every number until reaching the total lenght of the replacement word for number in range(length_near_species): #print('column number: ', number) # select the number of letters +1 of the original word part1 = new_main_tree[:cell+1] print('La Distancia de Levenshtein observa una parte del',new_main_tree, ':', part1, file=file_out) # select the number of letters +1 of the replacement word part2 = near_species[:number+1] print('Después observa una parte del', near_species, ':', part2, file=file_out) # selected letters of the original words replace the selected letters of the replacement word new_species = part1 + near_species[number+1:] print('En su intento de comparación reemplaza la parte del', near_species, 'por la parte del', new_main_tree, 'y crea así una nueva especie intermediaria: el', new_species, file=file_out) # add in between words to the list in_between_species.append(new_species) # calculate the similarity between in between words print('Calcula las acciones necesarias para reemplazar', new_main_tree[:cell+1], 'por', near_species[:number+1], ': ', Levenshtein.distance(new_main_tree[:cell+1], near_species[:number+1]), '\n', file=file_out) # print('\n', file=file_out) ## Print all in between words #print('in between species: ', in_between_species, file=file_out) return in_between_species # Draw a map of all in between species and their distance to main tree def print_map(sorted_distances, show_distances=True, file_out=sys.stdout): # As characters are less wide than high make them a bit wider xscale = 2 # Height of the map height = 70 # Width of the map width = int(70 * xscale) # Centerpoint of the map middle = (int(30 * xscale), 35) grid = [[] for x in range(height + 1)] centerpoint = sorted_distances[0][0] start = middle[0] - max(0, int(len(centerpoint) / 2)) grid[middle[1]].append((start, start + len(centerpoint), centerpoint)) for treename, distance in sorted_distances[1:]: placed = False treename = ' {}({}) '.format(treename, distance) if show_distances else ' {} '.format(treename) angle = random.random() * math.pi * 2 step = 0 steps = 180 while (not placed) and step < steps: x = int(math.floor(math.cos(angle) * (distance * 2.8)) * xscale + middle[0]) y = int(math.floor(math.sin(angle) * (distance * 2.8)) + middle[1]) start = x - max(0, int(len(treename) / 2)) end = start + len(treename) if space_available(grid, start, end, y): grid[y].append((start, end, treename)) placed = True angle += (math.pi * 2 / steps) step += 1 if not placed: print('Could not place {}'.format(treename), file=file_out) # print(angle, x, y) for row in grid: # Sort by first key of the tuples, start of the word row = sorted(row, key=lambda r: r[0]) if len(row): line = '' for start, _, treename in row: line += ((start) - len(line)) * ' ' + treename print(line, file=file_out) # draw table with all new intermediary species def print_table(new_main_tree, near_species, in_between_species, file_out=sys.stdout): ## 8. Print tabel print('{} → {}'.format(new_main_tree, near_species), file=file_out) print('', file=file_out) print(formatTable(in_between_species, 20, 5, cells=len(near_species)), file=file_out) # Execute functions if __name__ == '__main__': ## 1A. Open textfiles & turn textfiles in machine readable lists # Cortazar txt = 'eucalipto_cortazar.txt' # Open textfile all_text = openfile(txt) # List of trees txt1 = 'arboles_simple.txt' txt2 = 'arboles_plural.txt' all_simple = openfile(txt1) all_plural = openfile(txt2) ## 1B. Turn list of trees into dictionary of single/plural words trees = dict(zip(all_simple, all_plural)) ## 2. HOMEPAGE print statements ## print fragment Cortazar with open(txt, 'r') as source: fragment = source.read() ## 2b. Ask user to pick a new main tree new_main_tree = pick_new_main_tree_terminal(fragment, all_simple, sys.stdout) ## 3. Count the similarity between the main tree and the other species in the forest distances = calculate_distances(new_main_tree, all_simple) ## 4. Sort the distances between the trees and the new main tree from the lowest to the highest counts sorted_distances = sort_distances(distances) # Find the nearest species near_species, nearest_species = find_nearest_species(sorted_distances, trees) # Print rewritten fragment print("\n") new_fragment = generate_new_fragment(fragment, new_main_tree, nearest_species) print(new_fragment, file=sys.stdout) ## 6. Compare the similarity between the main character tree and the main species in the forest, in_between_species = generate_in_between_species(new_main_tree, near_species, file_out=sys.stdout) # Show the sorted distances dictionary # print('Sorted distances: ', sorted_distances, file=sys.stdout) # print('\n', file=sys.stdout) # generate_new_text(fragment, new_main_tree, all_simple, all_plural, trees, sys.stdout) ## 7. Generate map of the woods print_map(sorted_distances, file_out=sys.stdout) ## 8. Generate intermediary species table print_table(new_main_tree, near_species, in_between_species, sys.stdout)