You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
316 lines
11 KiB
Python
316 lines
11 KiB
Python
#!/usr/bin/env/ python
|
|
|
|
# This script unfolds the Levenhstein Distance algorithm,
|
|
# often used in spellcheckers and to detect similarity between texts.
|
|
# The algorithm reads a fragment on a eucalyptus tree from
|
|
# Julio Cortazar's 'Historias de Cronopios y Famas', Alfaguara, 2012.
|
|
|
|
# Copyright (C) 2021, Anais Berck
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details: <http://www.gnu.org/licenses/>.
|
|
|
|
# Python libraries
|
|
import random
|
|
import math
|
|
import random
|
|
import textwrap
|
|
import Levenshtein
|
|
import sys
|
|
|
|
# Functions
|
|
|
|
# Open & prepare textfile for machinal reading
|
|
def openfile(txt):
|
|
all_text = []
|
|
with open(txt, 'r') as source:
|
|
# Read the text
|
|
text = source.read()
|
|
# Remove punctuation
|
|
characters = ['\n', '-', ':', '.', ',']
|
|
for c in characters:
|
|
if c in text:
|
|
clean_text = text.replace(c,' ')
|
|
# Transform fragment in a list of words
|
|
words = clean_text.split()
|
|
# Recolt all unique words of the fragment in a set of words
|
|
for word in words:
|
|
word = word.lower()
|
|
word = word.strip()
|
|
all_text.append(word)
|
|
return all_text
|
|
|
|
# Levenhstein Distance
|
|
def levenshtein(a, b):
|
|
if not a: return len(b)
|
|
if not b: return len(a)
|
|
return min(levenshtein(a[1:], b[1:])+(a[0] != b[0]),
|
|
levenshtein(a[1:], b)+1,
|
|
levenshtein(a, b[1:])+1)
|
|
|
|
# Create map
|
|
def space_available (grid, start, end, line):
|
|
other_words = grid[line]
|
|
|
|
for other_start, other_end, _ in other_words:
|
|
if start < other_end and end > other_start:
|
|
return False
|
|
return True
|
|
|
|
# Create frame
|
|
def formatTable(words, cellWidth, cellHeight, padding=1, cells=6):
|
|
def makeRow (height):
|
|
return [''] * cellHeight
|
|
|
|
def makeLine (width, text = '', fillChar=' ', padding=1):
|
|
text = (padding * fillChar) + text
|
|
return text.center(width - padding, ' ') + (padding * fillChar)
|
|
|
|
out = ""
|
|
|
|
row = makeRow(cellHeight)
|
|
|
|
cell = 0
|
|
|
|
for word in words:
|
|
wrapped = textwrap.wrap(word, width=cellWidth - (2 * padding))
|
|
lines = padding * [makeLine(cellWidth, padding=padding)]
|
|
|
|
for line in wrapped:
|
|
lines.append(makeLine(cellWidth, text=line))
|
|
|
|
for _ in range(cellHeight - len(lines)):
|
|
lines.append(makeLine(cellWidth, padding=padding))
|
|
|
|
cell += 1
|
|
|
|
for (i, line) in enumerate(lines):
|
|
row[i] += line
|
|
|
|
if cell == cells:
|
|
out += '\n'.join(row)
|
|
row = makeRow(cellHeight)
|
|
cell = 0
|
|
|
|
if cell > 0:
|
|
out += '\n'.join(row)
|
|
row = makeRow(cellHeight)
|
|
|
|
return out
|
|
|
|
# print statements for interaction with reader
|
|
def pick_new_main_tree_terminal(fragment, all_simple, file_out=sys.stdout):
|
|
print(fragment, file=file_out)
|
|
## ask input reader
|
|
print("\nIn order for Levenshtein Distance to read the text and produce a unique book, you need to change one word.", file=file_out)
|
|
print("\nYou can choose another tree species for the eucalyptus, the main tree in the fragment. The Levenshtein Distance will then calculate which species is in its vicinity and the more generic word 'trees' will be replaced in the fragment by this new species.", file=file_out)
|
|
print("\nPlease choose the main tree from the list below:", file=file_out)
|
|
print(', '.join(all_simple), file=file_out)
|
|
print("\nI would like to replace eucalyptus with:", file=file_out)
|
|
new_main_tree = input()
|
|
while new_main_tree not in all_simple:
|
|
print("Maybe you misspelled the name of the species, please try again.", file=file_out)
|
|
new_main_tree = input()
|
|
return new_main_tree
|
|
|
|
# Levenshtein Distance between new main tree and other tree species
|
|
def calculate_distances (center_tree, other_trees):
|
|
# Declare dictionary object
|
|
distances = {}
|
|
# For every species in the list, compare the main tree to the selected species and count the similarity between both
|
|
for tree in other_trees:
|
|
# using the Levenhstein Distance algorithm
|
|
# count = levenshtein(new_main_tree,tree)
|
|
count = Levenshtein.distance(center_tree, tree)
|
|
# save each compared species and its similarity count the dictionary
|
|
distances[tree] = count
|
|
|
|
return distances
|
|
|
|
|
|
def sort_distances (distances):
|
|
return sorted(distances.items(), key = lambda kv: kv[1])
|
|
|
|
|
|
# Find the minimum distance between new main tree and near species
|
|
def find_nearest_species(sorted_distances, trees):
|
|
# First entry in sorted_distances is the new_main_tree, distance 0
|
|
# pick the next value
|
|
minimum_distance = sorted_distances[1][1]
|
|
possible_trees = []
|
|
|
|
for tree in sorted_distances[1:]:
|
|
if tree[1] == minimum_distance:
|
|
possible_trees.append(tree)
|
|
else:
|
|
break
|
|
|
|
near_species = random.choice(possible_trees)[0]
|
|
nearest_species = trees[near_species]
|
|
|
|
return (near_species, nearest_species)
|
|
|
|
# rewrite fragment Cortazar
|
|
def generate_new_fragment(fragment, new_main_tree, nearest_species):
|
|
new_fragment = fragment.replace("eucalyptus", new_main_tree)
|
|
new_fragment = new_fragment.replace("trees", nearest_species)
|
|
new_fragment = new_fragment.replace("A fama is walking", "An ignorant fama is walking")
|
|
|
|
return new_fragment
|
|
|
|
# generate in between species and show the process of the Levenhstein Distance algorithm
|
|
def generate_in_between_species (new_main_tree, near_species, file_out=sys.stdout):
|
|
# Define length of main character tree and major species
|
|
length_main_character_tree = len(new_main_tree)
|
|
length_near_species = len(near_species)
|
|
|
|
# Declare a list of in between words showing the process
|
|
in_between_species = []
|
|
# Loop over every number until reaching the total lenght of the original word
|
|
for cell in range(length_main_character_tree):
|
|
#print('row number: ', element)
|
|
# Loop over every number until reaching the total lenght of the replacement word
|
|
for number in range(length_near_species):
|
|
#print('column number: ', number)
|
|
# select the number of letters +1 of the original word
|
|
part1 = new_main_tree[:cell+1]
|
|
print('The Levenshtein Distance observes a part of',new_main_tree, ':', part1, file=file_out)
|
|
# select the number of letters +1 of the replacement word
|
|
part2 = near_species[:number+1]
|
|
print('Next it observes a part of', near_species, ':', part2, file=file_out)
|
|
# selected letters of the original words replace the selected letters of the replacement word
|
|
new_species = part1 + near_species[number+1:]
|
|
print('In its attempt to compare it replaces the part of', near_species, 'for the part of', new_main_tree, 'and as such, it creates a new intermediary species, the', new_species, file=file_out)
|
|
# add in between words to the list
|
|
in_between_species.append(new_species)
|
|
# calculate the similarity between in between words
|
|
print('It calculates the actions needed to replace', new_main_tree[:cell+1], 'by', near_species[:number+1], ': ', Levenshtein.distance(new_main_tree[:cell+1], near_species[:number+1]), '\n', file=file_out)
|
|
# print('\n', file=file_out)
|
|
|
|
## Print all in between words
|
|
#print('in between species: ', in_between_species, file=file_out)
|
|
|
|
return in_between_species
|
|
|
|
# Draw a map of all in between species and their distance to main tree
|
|
def print_map(sorted_distances, show_distances=True, file_out=sys.stdout):
|
|
# As characters are less wide than high make them a bit wider
|
|
xscale = 2
|
|
# Height of the map
|
|
height = 70
|
|
# Width of the map
|
|
width = int(70 * xscale)
|
|
# Centerpoint of the map
|
|
middle = (int(30 * xscale), 35)
|
|
|
|
grid = [[] for x in range(height + 1)]
|
|
|
|
centerpoint = sorted_distances[0][0]
|
|
start = middle[0] - max(0, int(len(centerpoint) / 2))
|
|
|
|
grid[middle[1]].append((start, start + len(centerpoint), centerpoint))
|
|
|
|
for treename, distance in sorted_distances[1:]:
|
|
placed = False
|
|
|
|
treename = ' {}({}) '.format(treename, distance) if show_distances else ' {} '.format(treename)
|
|
angle = random.random() * math.pi * 2
|
|
step = 0
|
|
steps = 180
|
|
while (not placed) and step < steps:
|
|
|
|
x = int(math.floor(math.cos(angle) * (distance * 2.8)) * xscale + middle[0])
|
|
y = int(math.floor(math.sin(angle) * (distance * 2.8)) + middle[1])
|
|
|
|
start = x - max(0, int(len(treename) / 2))
|
|
end = start + len(treename)
|
|
|
|
if space_available(grid, start, end, y):
|
|
grid[y].append((start, end, treename))
|
|
placed = True
|
|
|
|
angle += (math.pi * 2 / steps)
|
|
step += 1
|
|
|
|
if not placed:
|
|
print('Could not place {}'.format(treename), file=file_out)
|
|
# print(angle, x, y)
|
|
|
|
for row in grid:
|
|
# Sort by first key of the tuples, start of the word
|
|
row = sorted(row, key=lambda r: r[0])
|
|
|
|
if len(row):
|
|
line = ''
|
|
for start, _, treename in row:
|
|
line += ((start) - len(line)) * ' ' + treename
|
|
|
|
print(line, file=file_out)
|
|
|
|
# draw table with all new intermediary species
|
|
def print_table(new_main_tree, near_species, in_between_species, file_out=sys.stdout):
|
|
## 8. Print tabel
|
|
print('{} → {}'.format(new_main_tree, near_species), file=file_out)
|
|
print('', file=file_out)
|
|
print(formatTable(in_between_species, 20, 5, cells=len(near_species)), file=file_out)
|
|
|
|
# Execute functions
|
|
if __name__ == '__main__':
|
|
## 1A. Open textfiles & turn textfiles in machine readable lists
|
|
|
|
# Cortazar
|
|
txt = 'eucalipto_cortazar_en.txt'
|
|
# Open textfile
|
|
all_text = openfile(txt)
|
|
|
|
# List of trees
|
|
txt1 = 'arboles_simple_en.txt'
|
|
txt2 = 'arboles_plural_en.txt'
|
|
all_simple = openfile(txt1)
|
|
all_plural = openfile(txt2)
|
|
|
|
## 1B. Turn list of trees into dictionary of single/plural words
|
|
trees = dict(zip(all_simple, all_plural))
|
|
|
|
## 2. HOMEPAGE print statements
|
|
## print fragment Cortazar
|
|
with open(txt, 'r') as source:
|
|
fragment = source.read()
|
|
|
|
## 2b. Ask user to pick a new main tree
|
|
new_main_tree = pick_new_main_tree_terminal(fragment, all_simple, sys.stdout)
|
|
|
|
## 3. Count the similarity between the main tree and the other species in the forest
|
|
distances = calculate_distances(new_main_tree, all_simple)
|
|
|
|
## 4. Sort the distances between the trees and the new main tree from the lowest to the highest counts
|
|
sorted_distances = sort_distances(distances)
|
|
|
|
# Find the nearest species
|
|
near_species, nearest_species = find_nearest_species(sorted_distances, trees)
|
|
|
|
# Print rewritten fragment
|
|
print("\n")
|
|
new_fragment = generate_new_fragment(fragment, new_main_tree, nearest_species)
|
|
print(new_fragment, file=sys.stdout)
|
|
|
|
## 6. Compare the similarity between the main character tree and the main species in the forest,
|
|
in_between_species = generate_in_between_species(new_main_tree, near_species, file_out=sys.stdout)
|
|
|
|
# Show the sorted distances dictionary
|
|
# print('Sorted distances: ', sorted_distances, file=sys.stdout)
|
|
# print('\n', file=sys.stdout)
|
|
|
|
# generate_new_text(fragment, new_main_tree, all_simple, all_plural, trees, sys.stdout)
|
|
## 7. Generate map of the woods
|
|
print_map(sorted_distances, file_out=sys.stdout)
|
|
|
|
## 8. Generate intermediary species table
|
|
print_table(new_main_tree, near_species, in_between_species, sys.stdout)
|