You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

316 lines
11 KiB
Python

#!/usr/bin/env/ python
# This script unfolds the Levenhstein Distance algorithm,
# often used in spellcheckers and to detect similarity between texts.
# The algorithm reads a fragment on a eucalyptus tree from
# Julio Cortazar's 'Historias de Cronopios y Famas', Alfaguara, 2012.
# Copyright (C) 2021, Anais Berck
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details: <http://www.gnu.org/licenses/>.
# Python libraries
import random
import math
import random
import textwrap
import Levenshtein
import sys
# Functions
# Open & prepare textfile for machinal reading
def openfile(txt):
all_text = []
with open(txt, 'r') as source:
# Read the text
text = source.read()
# Remove punctuation
characters = ['\n', '-', ':', '.', ',']
for c in characters:
if c in text:
clean_text = text.replace(c,' ')
# Transform fragment in a list of words
words = clean_text.split()
# Recolt all unique words of the fragment in a set of words
for word in words:
word = word.lower()
word = word.strip()
all_text.append(word)
return all_text
# Levenhstein Distance
def levenshtein(a, b):
if not a: return len(b)
if not b: return len(a)
return min(levenshtein(a[1:], b[1:])+(a[0] != b[0]),
levenshtein(a[1:], b)+1,
levenshtein(a, b[1:])+1)
# Create map
def space_available (grid, start, end, line):
other_words = grid[line]
for other_start, other_end, _ in other_words:
if start < other_end and end > other_start:
return False
return True
# Create frame
def formatTable(words, cellWidth, cellHeight, padding=1, cells=6):
def makeRow (height):
return [''] * cellHeight
def makeLine (width, text = '', fillChar=' ', padding=1):
text = (padding * fillChar) + text
return text.center(width - padding, ' ') + (padding * fillChar)
out = ""
row = makeRow(cellHeight)
cell = 0
for word in words:
wrapped = textwrap.wrap(word, width=cellWidth - (2 * padding))
lines = padding * [makeLine(cellWidth, padding=padding)]
for line in wrapped:
lines.append(makeLine(cellWidth, text=line))
for _ in range(cellHeight - len(lines)):
lines.append(makeLine(cellWidth, padding=padding))
cell += 1
for (i, line) in enumerate(lines):
row[i] += line
if cell == cells:
out += '\n'.join(row)
row = makeRow(cellHeight)
cell = 0
if cell > 0:
out += '\n'.join(row)
row = makeRow(cellHeight)
return out
# print statements for interaction with reader
def pick_new_main_tree_terminal(fragment, all_simple, file_out=sys.stdout):
print(fragment, file=file_out)
## ask input reader
print("\nIn order for Levenshtein Distance to read the text and produce a unique book, you need to change one word.", file=file_out)
print("\nYou can choose another tree species for the eucalyptus, the main tree in the fragment. The Levenshtein Distance will then calculate which species is in its vicinity and the more generic word 'trees' will be replaced in the fragment by this new species.", file=file_out)
print("\nPlease choose the main tree from the list below:", file=file_out)
print(', '.join(all_simple), file=file_out)
print("\nI would like to replace eucalyptus with:", file=file_out)
new_main_tree = input()
while new_main_tree not in all_simple:
print("Maybe you misspelled the name of the species, please try again.", file=file_out)
new_main_tree = input()
return new_main_tree
# Levenshtein Distance between new main tree and other tree species
def calculate_distances (center_tree, other_trees):
# Declare dictionary object
distances = {}
# For every species in the list, compare the main tree to the selected species and count the similarity between both
for tree in other_trees:
# using the Levenhstein Distance algorithm
# count = levenshtein(new_main_tree,tree)
count = Levenshtein.distance(center_tree, tree)
# save each compared species and its similarity count the dictionary
distances[tree] = count
return distances
def sort_distances (distances):
return sorted(distances.items(), key = lambda kv: kv[1])
# Find the minimum distance between new main tree and near species
def find_nearest_species(sorted_distances, trees):
# First entry in sorted_distances is the new_main_tree, distance 0
# pick the next value
minimum_distance = sorted_distances[1][1]
possible_trees = []
for tree in sorted_distances[1:]:
if tree[1] == minimum_distance:
possible_trees.append(tree)
else:
break
near_species = random.choice(possible_trees)[0]
nearest_species = trees[near_species]
return (near_species, nearest_species)
# rewrite fragment Cortazar
def generate_new_fragment(fragment, new_main_tree, nearest_species):
new_fragment = fragment.replace("eucalyptus", new_main_tree)
new_fragment = new_fragment.replace("trees", nearest_species)
new_fragment = new_fragment.replace("A fama is walking", "An ignorant fama is walking")
return new_fragment
# generate in between species and show the process of the Levenhstein Distance algorithm
def generate_in_between_species (new_main_tree, near_species, file_out=sys.stdout):
# Define length of main character tree and major species
length_main_character_tree = len(new_main_tree)
length_near_species = len(near_species)
# Declare a list of in between words showing the process
in_between_species = []
# Loop over every number until reaching the total lenght of the original word
for cell in range(length_main_character_tree):
#print('row number: ', element)
# Loop over every number until reaching the total lenght of the replacement word
for number in range(length_near_species):
#print('column number: ', number)
# select the number of letters +1 of the original word
part1 = new_main_tree[:cell+1]
print('The Levenshtein Distance observes a part of',new_main_tree, ':', part1, file=file_out)
# select the number of letters +1 of the replacement word
part2 = near_species[:number+1]
print('Next it observes a part of', near_species, ':', part2, file=file_out)
# selected letters of the original words replace the selected letters of the replacement word
new_species = part1 + near_species[number+1:]
print('In its attempt to compare it replaces the part of', near_species, 'for the part of', new_main_tree, 'and as such, it creates a new intermediary species, the', new_species, file=file_out)
# add in between words to the list
in_between_species.append(new_species)
# calculate the similarity between in between words
print('It calculates the actions needed to replace', new_main_tree[:cell+1], 'by', near_species[:number+1], ': ', Levenshtein.distance(new_main_tree[:cell+1], near_species[:number+1]), '\n', file=file_out)
# print('\n', file=file_out)
## Print all in between words
#print('in between species: ', in_between_species, file=file_out)
return in_between_species
# Draw a map of all in between species and their distance to main tree
def print_map(sorted_distances, show_distances=True, file_out=sys.stdout):
# As characters are less wide than high make them a bit wider
xscale = 2
# Height of the map
height = 70
# Width of the map
width = int(70 * xscale)
# Centerpoint of the map
middle = (int(30 * xscale), 35)
grid = [[] for x in range(height + 1)]
centerpoint = sorted_distances[0][0]
start = middle[0] - max(0, int(len(centerpoint) / 2))
grid[middle[1]].append((start, start + len(centerpoint), centerpoint))
for treename, distance in sorted_distances[1:]:
placed = False
treename = ' {}({}) '.format(treename, distance) if show_distances else ' {} '.format(treename)
angle = random.random() * math.pi * 2
step = 0
steps = 180
while (not placed) and step < steps:
x = int(math.floor(math.cos(angle) * (distance * 2.8)) * xscale + middle[0])
y = int(math.floor(math.sin(angle) * (distance * 2.8)) + middle[1])
start = x - max(0, int(len(treename) / 2))
end = start + len(treename)
if space_available(grid, start, end, y):
grid[y].append((start, end, treename))
placed = True
angle += (math.pi * 2 / steps)
step += 1
if not placed:
print('Could not place {}'.format(treename), file=file_out)
# print(angle, x, y)
for row in grid:
# Sort by first key of the tuples, start of the word
row = sorted(row, key=lambda r: r[0])
if len(row):
line = ''
for start, _, treename in row:
line += ((start) - len(line)) * ' ' + treename
print(line, file=file_out)
# draw table with all new intermediary species
def print_table(new_main_tree, near_species, in_between_species, file_out=sys.stdout):
## 8. Print tabel
print('{}{}'.format(new_main_tree, near_species), file=file_out)
print('', file=file_out)
print(formatTable(in_between_species, 20, 5, cells=len(near_species)), file=file_out)
# Execute functions
if __name__ == '__main__':
## 1A. Open textfiles & turn textfiles in machine readable lists
# Cortazar
txt = 'eucalipto_cortazar_en.txt'
# Open textfile
all_text = openfile(txt)
# List of trees
txt1 = 'arboles_simple_en.txt'
txt2 = 'arboles_plural_en.txt'
all_simple = openfile(txt1)
all_plural = openfile(txt2)
## 1B. Turn list of trees into dictionary of single/plural words
trees = dict(zip(all_simple, all_plural))
## 2. HOMEPAGE print statements
## print fragment Cortazar
with open(txt, 'r') as source:
fragment = source.read()
## 2b. Ask user to pick a new main tree
new_main_tree = pick_new_main_tree_terminal(fragment, all_simple, sys.stdout)
## 3. Count the similarity between the main tree and the other species in the forest
distances = calculate_distances(new_main_tree, all_simple)
## 4. Sort the distances between the trees and the new main tree from the lowest to the highest counts
sorted_distances = sort_distances(distances)
# Find the nearest species
near_species, nearest_species = find_nearest_species(sorted_distances, trees)
# Print rewritten fragment
print("\n")
new_fragment = generate_new_fragment(fragment, new_main_tree, nearest_species)
print(new_fragment, file=sys.stdout)
## 6. Compare the similarity between the main character tree and the main species in the forest,
in_between_species = generate_in_between_species(new_main_tree, near_species, file_out=sys.stdout)
# Show the sorted distances dictionary
# print('Sorted distances: ', sorted_distances, file=sys.stdout)
# print('\n', file=sys.stdout)
# generate_new_text(fragment, new_main_tree, all_simple, all_plural, trees, sys.stdout)
## 7. Generate map of the woods
print_map(sorted_distances, file_out=sys.stdout)
## 8. Generate intermediary species table
print_table(new_main_tree, near_species, in_between_species, sys.stdout)