#!/usr/bin/env/ python # This script unfolds the Levenhstein Distance algorithm, # often used in spellcheckers and to detect similarity between texts. # The algorithm reads a fragment on a eucalyptus tree from # Julio Cortazar's 'Historias de Cronopios y Famas', Alfaguara, 2012. # Copyright (C) 2021, Anais Berck # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details: . # Python libraries import random import math import random import textwrap import Levenshtein import sys # Functions # Open & prepare textfile for machinal reading def openfile(txt): all_text = [] with open(txt, 'r') as source: # Read the text text = source.read() # Remove punctuation characters = ['\n', '-', ':', '.', ','] for c in characters: if c in text: clean_text = text.replace(c,' ') # Transform fragment in a list of words words = clean_text.split() # Recolt all unique words of the fragment in a set of words for word in words: word = word.lower() word = word.strip() all_text.append(word) return all_text # Levenhstein Distance def levenshtein(a, b): if not a: return len(b) if not b: return len(a) return min(levenshtein(a[1:], b[1:])+(a[0] != b[0]), levenshtein(a[1:], b)+1, levenshtein(a, b[1:])+1) # Create map def space_available (grid, start, end, line): other_words = grid[line] for other_start, other_end, _ in other_words: if start < other_end and end > other_start: return False return True # Create frame def formatTable(words, cellWidth, cellHeight, padding=1, cells=6): def makeRow (height): return [''] * cellHeight def makeLine (width, text = '', fillChar=' ', padding=1): text = (padding * fillChar) + text return text.center(width - padding, ' ') + (padding * fillChar) out = "" row = makeRow(cellHeight) cell = 0 for word in words: wrapped = textwrap.wrap(word, width=cellWidth - (2 * padding)) lines = padding * [makeLine(cellWidth, padding=padding)] for line in wrapped: lines.append(makeLine(cellWidth, text=line)) for _ in range(cellHeight - len(lines)): lines.append(makeLine(cellWidth, padding=padding)) cell += 1 for (i, line) in enumerate(lines): row[i] += line if cell == cells: out += '\n'.join(row) row = makeRow(cellHeight) cell = 0 if cell > 0: out += '\n'.join(row) row = makeRow(cellHeight) return out # print statements for interaction with reader def pick_new_main_tree_terminal(fragment, all_simple, file_out=sys.stdout): print(fragment, file=file_out) ## ask input reader print("\nIn order for Levenshtein Distance to read the text and produce a unique book, you need to change one word.", file=file_out) print("\nYou can choose another tree species for the eucalyptus, the main tree in the fragment. The Levenshtein Distance will then calculate which species is in its vicinity and the more generic word 'trees' will be replaced in the fragment by this new species.", file=file_out) print("\nPlease choose the main tree from the list below:", file=file_out) print(', '.join(all_simple), file=file_out) print("\nI would like to replace eucalyptus with:", file=file_out) new_main_tree = input() while new_main_tree not in all_simple: print("Maybe you misspelled the name of the species, please try again.", file=file_out) new_main_tree = input() return new_main_tree # Levenshtein Distance between new main tree and other tree species def calculate_distances (center_tree, other_trees): # Declare dictionary object distances = {} # For every species in the list, compare the main tree to the selected species and count the similarity between both for tree in other_trees: # using the Levenhstein Distance algorithm # count = levenshtein(new_main_tree,tree) count = Levenshtein.distance(center_tree, tree) # save each compared species and its similarity count the dictionary distances[tree] = count return distances def sort_distances (distances): return sorted(distances.items(), key = lambda kv: kv[1]) # Find the minimum distance between new main tree and near species def find_nearest_species(sorted_distances, trees): # First entry in sorted_distances is the new_main_tree, distance 0 # pick the next value minimum_distance = sorted_distances[1][1] possible_trees = [] for tree in sorted_distances[1:]: if tree[1] == minimum_distance: possible_trees.append(tree) else: break near_species = random.choice(possible_trees)[0] nearest_species = trees[near_species] return (near_species, nearest_species) # rewrite fragment Cortazar def generate_new_fragment(fragment, new_main_tree, nearest_species): new_fragment = fragment.replace("eucalyptus", new_main_tree) new_fragment = new_fragment.replace("trees", nearest_species) new_fragment = new_fragment.replace("A fama is walking", "An ignorant fama is walking") return new_fragment # generate in between species and show the process of the Levenhstein Distance algorithm def generate_in_between_species (new_main_tree, near_species, file_out=sys.stdout): # Define length of main character tree and major species length_main_character_tree = len(new_main_tree) length_near_species = len(near_species) # Declare a list of in between words showing the process in_between_species = [] # Loop over every number until reaching the total lenght of the original word for cell in range(length_main_character_tree): #print('row number: ', element) # Loop over every number until reaching the total lenght of the replacement word for number in range(length_near_species): #print('column number: ', number) # select the number of letters +1 of the original word part1 = new_main_tree[:cell+1] print('The Levenshtein Distance observes a part of',new_main_tree, ':', part1, file=file_out) # select the number of letters +1 of the replacement word part2 = near_species[:number+1] print('Next it observes a part of', near_species, ':', part2, file=file_out) # selected letters of the original words replace the selected letters of the replacement word new_species = part1 + near_species[number+1:] print('In its attempt to compare it replaces the part of', near_species, 'for the part of', new_main_tree, 'and as such, it creates a new intermediary species, the', new_species, file=file_out) # add in between words to the list in_between_species.append(new_species) # calculate the similarity between in between words print('It calculates the actions needed to replace', new_main_tree[:cell+1], 'by', near_species[:number+1], ': ', Levenshtein.distance(new_main_tree[:cell+1], near_species[:number+1]), '\n', file=file_out) # print('\n', file=file_out) ## Print all in between words #print('in between species: ', in_between_species, file=file_out) return in_between_species # Draw a map of all in between species and their distance to main tree def print_map(sorted_distances, show_distances=True, file_out=sys.stdout): # As characters are less wide than high make them a bit wider xscale = 2 # Height of the map height = 70 # Width of the map width = int(70 * xscale) # Centerpoint of the map middle = (int(30 * xscale), 35) grid = [[] for x in range(height + 1)] centerpoint = sorted_distances[0][0] start = middle[0] - max(0, int(len(centerpoint) / 2)) grid[middle[1]].append((start, start + len(centerpoint), centerpoint)) for treename, distance in sorted_distances[1:]: placed = False treename = ' {}({}) '.format(treename, distance) if show_distances else ' {} '.format(treename) angle = random.random() * math.pi * 2 step = 0 steps = 180 while (not placed) and step < steps: x = int(math.floor(math.cos(angle) * (distance * 2.8)) * xscale + middle[0]) y = int(math.floor(math.sin(angle) * (distance * 2.8)) + middle[1]) start = x - max(0, int(len(treename) / 2)) end = start + len(treename) if space_available(grid, start, end, y): grid[y].append((start, end, treename)) placed = True angle += (math.pi * 2 / steps) step += 1 if not placed: print('Could not place {}'.format(treename), file=file_out) # print(angle, x, y) for row in grid: # Sort by first key of the tuples, start of the word row = sorted(row, key=lambda r: r[0]) if len(row): line = '' for start, _, treename in row: line += ((start) - len(line)) * ' ' + treename print(line, file=file_out) # draw table with all new intermediary species def print_table(new_main_tree, near_species, in_between_species, file_out=sys.stdout): ## 8. Print tabel print('{} → {}'.format(new_main_tree, near_species), file=file_out) print('', file=file_out) print(formatTable(in_between_species, 20, 5, cells=len(near_species)), file=file_out) # Execute functions if __name__ == '__main__': ## 1A. Open textfiles & turn textfiles in machine readable lists # Cortazar txt = 'eucalipto_cortazar_en.txt' # Open textfile all_text = openfile(txt) # List of trees txt1 = 'arboles_simple_en.txt' txt2 = 'arboles_plural_en.txt' all_simple = openfile(txt1) all_plural = openfile(txt2) ## 1B. Turn list of trees into dictionary of single/plural words trees = dict(zip(all_simple, all_plural)) ## 2. HOMEPAGE print statements ## print fragment Cortazar with open(txt, 'r') as source: fragment = source.read() ## 2b. Ask user to pick a new main tree new_main_tree = pick_new_main_tree_terminal(fragment, all_simple, sys.stdout) ## 3. Count the similarity between the main tree and the other species in the forest distances = calculate_distances(new_main_tree, all_simple) ## 4. Sort the distances between the trees and the new main tree from the lowest to the highest counts sorted_distances = sort_distances(distances) # Find the nearest species near_species, nearest_species = find_nearest_species(sorted_distances, trees) # Print rewritten fragment print("\n") new_fragment = generate_new_fragment(fragment, new_main_tree, nearest_species) print(new_fragment, file=sys.stdout) ## 6. Compare the similarity between the main character tree and the main species in the forest, in_between_species = generate_in_between_species(new_main_tree, near_species, file_out=sys.stdout) # Show the sorted distances dictionary # print('Sorted distances: ', sorted_distances, file=sys.stdout) # print('\n', file=sys.stdout) # generate_new_text(fragment, new_main_tree, all_simple, all_plural, trees, sys.stdout) ## 7. Generate map of the woods print_map(sorted_distances, file_out=sys.stdout) ## 8. Generate intermediary species table print_table(new_main_tree, near_species, in_between_species, sys.stdout)