Levenshtein-Distance-lee-a-.../la_distancia_de_levenshtein...

#!/usr/bin/env/ python

#    This script unfolds the Levenhstein Distance algorithm,
#    often used in spellcheckers and to detect similarity between texts.
#    The algorithm reads a fragment on a eucalyptus tree from
#    Julio Cortazar's 'Historias de Cronopios y Famas', Alfaguara, 2012.

#    Copyright (C) 2021, Anais Berck
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.

#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details: <http://www.gnu.org/licenses/>.

# Python libraries
import random
import math
import random
import textwrap
import Levenshtein
import sys

# Functions

# Open & prepare textfile for machinal reading
def openfile(txt):
	all_text = []
	with open(txt, 'r') as source:
		# Read the text
		text = source.read()
		# Remove punctuation
		characters = ['\n', '-', ':', '.', ',']
		for c in characters:
			if c in text:
				clean_text = text.replace(c,' ')
		# Transform fragment in a list of words
		words = clean_text.split()
		# Recolt all unique words of the fragment in a set of words
		for word in words:
			word = word.lower()
			word = word.strip()
			all_text.append(word)
	return all_text

# Levenhstein Distance
def levenshtein(a, b):
	if not a: return len(b)
	if not b: return len(a)
	return min(levenshtein(a[1:], b[1:])+(a[0] != b[0]),
			   levenshtein(a[1:], b)+1,
			   levenshtein(a, b[1:])+1)

# Create map
def space_available (grid, start, end, line):
	other_words = grid[line]

	for other_start, other_end, _ in other_words:
		if start < other_end and end > other_start:
			return False
	return True

# Create frame
def formatTable(words, cellWidth, cellHeight, padding=1, cells=6):
	def makeRow (height):
		return [''] * cellHeight

	def makeLine (width, text = '', fillChar=' ', padding=1):
		text = (padding * fillChar) + text
		return text.center(width - padding, ' ') + (padding * fillChar)

	out = ""

	row = makeRow(cellHeight)

	cell = 0

	for word in words:
		wrapped = textwrap.wrap(word, width=cellWidth - (2 * padding))
		lines = padding * [makeLine(cellWidth, padding=padding)]

		for line in wrapped:
			lines.append(makeLine(cellWidth, text=line))

		for _ in range(cellHeight - len(lines)):
			lines.append(makeLine(cellWidth, padding=padding))

		cell += 1

		for (i, line) in enumerate(lines):
			row[i] += line

		if cell == cells:
			out += '\n'.join(row)
			row = makeRow(cellHeight)
			cell = 0

	if cell > 0:
		out += '\n'.join(row)
		row = makeRow(cellHeight)

	return out

# print statements for interaction with reader
def pick_new_main_tree_terminal(fragment, all_simple, file_out=sys.stdout):
	print(fragment, file=file_out)
	## ask input reader
	print("\nPara que La Distancia de Levenshtein pueda leer el texto y producir un libro único, necesitas cambiar una palabra.", file=file_out)
	print("\nPuedes elegir otra especie de árbol para el eucalipto, el árbol principal del fragmento. La Distancia de Levenshtein calculará entonces qué especie se encuentra en su cercanía y se reemplazará en el fragmento la palabra más genérica 'árboles' por esta nueva especie.", file=file_out)
	print("\nPor favor, elige el árbol principal de la lista siguiente:", file=file_out)
	print(', '.join(all_simple), file=file_out)
	print("\nQuiero reemplazar el eucalipto por:", file=file_out)
	new_main_tree = input()
	while new_main_tree not in all_simple:
			print("Tal vez escribiste mal la especie, por favor intente de nuevo.", file=file_out)
			new_main_tree = input()
	return new_main_tree

# Levenshtein Distance between new main tree and other tree species
def calculate_distances (center_tree, other_trees):
	# Declare dictionary object
	distances = {}
	# For every species in the list, compare the main tree to the selected species and count the similarity between both
	for tree in other_trees:
		# using the Levenhstein Distance algorithm
		# count = levenshtein(new_main_tree,tree)
		count = Levenshtein.distance(center_tree, tree)
		# save each compared species and its similarity count the dictionary
		distances[tree] = count

	return distances


def sort_distances (distances):
	return sorted(distances.items(), key = lambda kv: kv[1])


# Find the minimum distance between new main tree and near species
def find_nearest_species(sorted_distances, trees):
	# First entry in sorted_distances is the new_main_tree, distance 0
	# pick the next value
	minimum_distance = sorted_distances[1][1]
	possible_trees = []

	for tree in sorted_distances[1:]:
		if tree[1] == minimum_distance:
			possible_trees.append(tree)
		else:
			break

	near_species = random.choice(possible_trees)[0]
	nearest_species = trees[near_species]

	return (near_species, nearest_species)

# rewrite fragment Cortazar
def generate_new_fragment(fragment, new_main_tree, nearest_species):
	new_fragment = fragment.replace("eucalipto", new_main_tree)
	new_fragment = new_fragment.replace("árboles", nearest_species)
	new_fragment = new_fragment.replace("Un fama anda", "Un fama ignorante anda")

	return new_fragment

# generate in between species and show the process of the Levenhstein Distance algorithm
def generate_in_between_species (new_main_tree, near_species, file_out=sys.stdout):
	# Define length of main character tree and major species
	length_main_character_tree = len(new_main_tree)
	length_near_species = len(near_species)

	# Declare a list of in between words showing the process
	in_between_species = []
	# Loop over every number until reaching the total lenght of the original word
	for cell in range(length_main_character_tree):
		#print('row number: ', element)
		# Loop over every number until reaching the total lenght of the replacement word
		for number in range(length_near_species):
			#print('column number: ', number)
			# select the number of letters +1 of the original word
			part1 = new_main_tree[:cell+1]
			print('La Distancia de Levenshtein observa una parte del',new_main_tree, ':', part1, file=file_out)
			# select the number of letters +1 of the replacement word
			part2 = near_species[:number+1]
			print('Después observa una parte del', near_species, ':', part2, file=file_out)
			# selected letters of the original words replace the selected letters of the replacement word
			new_species = part1 + near_species[number+1:]
			print('En su intento de comparación reemplaza la parte del', near_species, 'por la parte del', new_main_tree, 'y crea así una nueva especie intermediaria: el', new_species, file=file_out)
			# add in between words to the list
			in_between_species.append(new_species)
			# calculate the similarity between in between words
			print('Calcula las acciones necesarias para reemplazar', new_main_tree[:cell+1], 'por', near_species[:number+1], ': ', Levenshtein.distance(new_main_tree[:cell+1], near_species[:number+1]), '\n', file=file_out)
			# print('\n', file=file_out)

	## Print all in between words
	#print('in between species: ', in_between_species, file=file_out)

	return in_between_species

# Draw a map of all in between species and their distance to main tree
def print_map(sorted_distances, show_distances=True, file_out=sys.stdout):
	# As characters are less wide than high make them a bit wider
	xscale = 2
	# Height of the map
	height = 70
	# Width of the map
	width = int(70 * xscale)
	# Centerpoint of the map
	middle = (int(30 * xscale), 35)

	grid = [[] for x in range(height + 1)]

	centerpoint = sorted_distances[0][0]
	start = middle[0] - max(0, int(len(centerpoint) / 2))

	grid[middle[1]].append((start, start + len(centerpoint), centerpoint))

	for treename, distance in sorted_distances[1:]:
		placed = False

		treename = ' {}({}) '.format(treename, distance) if show_distances else ' {} '.format(treename)
		angle = random.random() * math.pi * 2
		step = 0
		steps = 180
		while (not placed) and step < steps:

			x = int(math.floor(math.cos(angle) * (distance * 2.8)) * xscale + middle[0])
			y = int(math.floor(math.sin(angle) * (distance * 2.8)) + middle[1])

			start = x - max(0, int(len(treename) / 2))
			end = start + len(treename)

			if space_available(grid, start, end, y):
				grid[y].append((start, end, treename))
				placed = True

			angle += (math.pi * 2 / steps)
			step += 1

		if not placed:
			print('Could not place {}'.format(treename), file=file_out)
		# print(angle, x, y)

	for row in grid:
		# Sort by first key of the tuples, start of the word
		row = sorted(row, key=lambda r: r[0])

		if len(row):
			line = ''
			for start, _, treename in row:
				line += ((start) - len(line)) * ' ' + treename

			print(line, file=file_out)

# draw table with all new intermediary species
def print_table(new_main_tree, near_species, in_between_species, file_out=sys.stdout):
	## 8. Print tabel
	print('{} → {}'.format(new_main_tree, near_species), file=file_out)
	print('', file=file_out)
	print(formatTable(in_between_species, 20, 5, cells=len(near_species)), file=file_out)

# Execute functions
if __name__ == '__main__':
	## 1A. Open textfiles & turn textfiles in machine readable lists

	# Cortazar
	txt = 'eucalipto_cortazar.txt'
	# Open textfile
	all_text = openfile(txt)

	# List of trees
	txt1 = 'arboles_simple.txt'
	txt2 = 'arboles_plural.txt'
	all_simple = openfile(txt1)
	all_plural = openfile(txt2)

	## 1B. Turn list of trees into dictionary of single/plural words
	trees = dict(zip(all_simple, all_plural))

	## 2. HOMEPAGE print statements
	## print fragment Cortazar
	with open(txt, 'r') as source:
		fragment = source.read()

	## 2b. Ask user to pick a new main tree
	new_main_tree = pick_new_main_tree_terminal(fragment, all_simple, sys.stdout)

	## 3. Count the similarity between the main tree and the other species in the forest
	distances = calculate_distances(new_main_tree, all_simple)

	## 4. Sort the distances between the trees and the new main tree from the lowest to the highest counts
	sorted_distances = sort_distances(distances)

	# Find the nearest species
	near_species, nearest_species = find_nearest_species(sorted_distances, trees)

	# Print rewritten fragment
	print("\n")
	new_fragment = generate_new_fragment(fragment, new_main_tree, nearest_species)
	print(new_fragment, file=sys.stdout)

	## 6. Compare the similarity between the main character tree and the main species in the forest,
	in_between_species = generate_in_between_species(new_main_tree, near_species, file_out=sys.stdout)

	# Show the sorted distances dictionary
	# print('Sorted distances: ', sorted_distances, file=sys.stdout)
	# print('\n', file=sys.stdout)

	# generate_new_text(fragment, new_main_tree, all_simple, all_plural, trees, sys.stdout)
	## 7. Generate map of the woods
	print_map(sorted_distances, file_out=sys.stdout)

	## 8. Generate intermediary species table
	print_table(new_main_tree, near_species, in_between_species, sys.stdout)