diff --git a/scripts/app.py b/scripts/app.py index d2cf258..85fe50b 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -16,6 +16,8 @@ from flask import Flask, render_template, request, Response, session from pagedjs import make_pdf from settings import DEBUG, BASEURL, DEFAULT_LANGUAGE, SECRET_KEY +import textwrap + import os from fcntl import lockf, LOCK_EX, LOCK_UN @@ -76,6 +78,12 @@ def get_edition_count_en(): return edition_count +def wrap (text, width): + return'\n'.join(['\n'.join(textwrap.wrap(line, width=width)) for line in text.splitlines()]) + +def read_sources (*paths): + return [ (p, wrap(open(p, 'r').read(), 105)) for p in paths ] + def get_language(): if 'LANGUAGE' in session: return session['LANGUAGE'] @@ -87,10 +95,38 @@ def set_language(language): session.modified = True def index_es(): - return render_template('index.html') + context = { + BASEURL: BASEURL + } + return render_template('index.html', **context) def index_en(): - return render_template('index_en.html') + context = { + BASEURL: BASEURL + } + return render_template('index_en.html', **context) + +""" + Adds the last word of the previous step to the itinerary +""" +def add_last_word_previous_step (itinerary): + new_itinerary = [] + last_word_previous_step = None + + for step in itinerary: + new_itinerary.append( + ( + step[0], + step[1], + step[2], + step[3], + last_word_previous_step + ) + ) + + last_word_previous_step = step[3][-1][0] + + return new_itinerary @app.route('{}/en'.format(BASEURL)) def en(): @@ -125,7 +161,8 @@ def book_es (): author = 'Benito Pérez Gáldos' # Non breaking spaces title = 'Miau' - path = crear_camino(novel, first_word, 'es') + path = add_last_word_previous_step(crear_camino(novel, first_word, 'es')) + complete_sentence = path[-1][1] + path[-1][0] context = { 'title': title, @@ -134,6 +171,8 @@ def book_es (): 'STATIC_DIR': '/static' if DEBUG else PAGEDJS_STATIC_DIR, 'DEBUG': DEBUG, 'edition_count': edition_count, + 'sources': read_sources('paseo.py', 
'medialab.py'), + 'complete_sentence': complete_sentence, } html = render_template('book.html', **context) @@ -167,7 +206,8 @@ def book_en (): author = 'Benito Pérez Gáldos' # Non breaking spaces title = 'Marianela' - path = crear_camino(novel, first_word, 'en') + path = add_last_word_previous_step(crear_camino(novel, first_word, 'en')) + complete_sentence = path[-1][1] + path[-1][0] context = { 'title': title, @@ -176,6 +216,8 @@ def book_en (): 'STATIC_DIR': '/static' if DEBUG else PAGEDJS_STATIC_DIR, 'DEBUG': DEBUG, 'edition_count': edition_count, + 'sources': read_sources('paseo.py', 'medialab.py'), + 'complete_sentence': complete_sentence, } html = render_template('book_en.html', **context) diff --git a/scripts/medialab.py b/scripts/medialab.py index 0cf7940..b05a160 100644 --- a/scripts/medialab.py +++ b/scripts/medialab.py @@ -27,7 +27,7 @@ def limpiar_texto(fragmento): fragmento_limpio = ' '.join(fragmento_limpio) return fragmento_limpio -def crear_base_datos(nombre_texto, lenguaje='es'): +def crear_base_datos(nombre_texto, idioma='es'): # Abrir el archivo de texto para crear la base de datos archivo = open(nombre_texto, 'r') fragmento = archivo.read() @@ -35,11 +35,11 @@ def crear_base_datos(nombre_texto, lenguaje='es'): fragmento_limpio = limpiar_texto(fragmento) # Tokenización del fragmento de texto - if lenguaje == 'es': - doc = nlp(fragmento_limpio) + if idioma == 'en': + doc = nlp_en(fragmento_limpio) doc_len = len(doc) else: - doc = nlp_en(fragmento_limpio) + doc = nlp(fragmento_limpio) doc_len = len(doc) palabras_arboles = {} #Verbos, sustantivos, adverbios y adjetivos diff --git a/scripts/paseo.py b/scripts/paseo.py index c164bb9..0730612 100644 --- a/scripts/paseo.py +++ b/scripts/paseo.py @@ -59,7 +59,7 @@ def path(word, words_tree, words_path, trees): # Add the current step, and the tree to the itinerary itinerary.append(( - current_step, + current_step, previous_steps, tree, markov_decision_traces @@ -77,12 +77,12 @@ def path(word, words_tree, 
words_path, trees): return itinerary # Genera un camino a partir de un texto y una palabra del texto -def crear_camino(nombre_archivo, palabra_inicial, lenguaje='es'): +def crear_camino(nombre_archivo, palabra_inicial, idioma='es'): trees = load_trees_from_json() shuffle(trees) #print("Starting to read text") - (palabras_arboles, palabras_camino) = crear_base_datos(nombre_archivo, lenguaje) + (palabras_arboles, palabras_camino) = crear_base_datos(nombre_archivo, idioma) #print("Amount of tree words: ", len(palabras_arboles)) diff --git a/scripts/static/fira-mono/FiraMono-Bold.ttf b/scripts/static/fira-mono/FiraMono-Bold.ttf new file mode 100644 index 0000000..db6f63f Binary files /dev/null and b/scripts/static/fira-mono/FiraMono-Bold.ttf differ diff --git a/scripts/static/fira-mono/FiraMono-Medium.ttf b/scripts/static/fira-mono/FiraMono-Medium.ttf new file mode 100644 index 0000000..892c124 Binary files /dev/null and b/scripts/static/fira-mono/FiraMono-Medium.ttf differ diff --git a/scripts/static/fira-mono/FiraMono-Regular.ttf b/scripts/static/fira-mono/FiraMono-Regular.ttf new file mode 100644 index 0000000..3910f17 Binary files /dev/null and b/scripts/static/fira-mono/FiraMono-Regular.ttf differ diff --git a/scripts/static/fira-mono/OFL.txt b/scripts/static/fira-mono/OFL.txt new file mode 100644 index 0000000..1ba1596 --- /dev/null +++ b/scripts/static/fira-mono/OFL.txt @@ -0,0 +1,93 @@ +Copyright (c) 2012-2013, The Mozilla Corporation and Telefonica S.A. + +This Font Software is licensed under the SIL Open Font License, Version 1.1. 
+This license is copied below, and is also available with a FAQ at: +http://scripts.sil.org/OFL + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. 
+ +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. 
+ +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/scripts/static/images/markov_1.jpeg b/scripts/static/images/markov_1.jpeg new file mode 100644 index 0000000..ead0555 Binary files /dev/null and b/scripts/static/images/markov_1.jpeg differ diff --git a/scripts/static/images/markov_2.jpeg b/scripts/static/images/markov_2.jpeg new file mode 100644 index 0000000..704c045 Binary files /dev/null and b/scripts/static/images/markov_2.jpeg differ diff --git a/scripts/templates/book.html b/scripts/templates/book.html index 41e6cb1..a130a41 100644 --- a/scripts/templates/book.html +++ b/scripts/templates/book.html @@ -28,31 +28,56 @@ font-style: italic; } + @font-face { + font-family: "Fira mono"; + src: url({{ STATIC_DIR }}/fira-mono/FiraMono-Regular.ttf) format("truetype"); + font-weight: 400; + font-style: normal; + } + + @font-face { + font-family: "Fira mono"; + src: url({{ STATIC_DIR }}/fira-mono/FiraMono-Medium.ttf) format("truetype"); + font-weight: 500; + font-style: normal; + } + + @font-face { + font-family: "Fira mono"; + src: url({{ STATIC_DIR }}/fira-mono/FiraMono-Bold.ttf) format("truetype"); + font-weight: 600; + font-style: bold; + } + @page { size: 210mm 297mm; + margin: 10mm 20mm 15mm 20mm; + } + + @page chain { margin: 0mm 10mm 15mm 10mm; } @page title { - margin: 10mm 10mm 15mm 10mm; background: green; + margin: 10mm 10mm 15mm 10mm; } - @page: left { + @page:left { @bottom-left { 
text-align: left; content: counter(page); } } - @page: right { + @page:right { @bottom-right { text-align: right; content: counter(page); } } - @page: empty { + @page:empty { @bottom-right { content: ''; } @@ -72,7 +97,10 @@ } } - + #chapter-chain { + page: chain; + } + :root { --font-size: 10pt; --line-height: 15pt; @@ -85,11 +113,6 @@ line-height: var(--line-height); } - ul { - margin: 0; - padding: 0; - } - h1 { page: title; color: white; @@ -105,6 +128,24 @@ font-style: normal; } + h2 { + font-family: Serreria; + font-size: 18pt; + line-height: 24pt; + } + + ul { + margin: 0 0 0 1.1em; + padding: 0; + list-style-type: none; + } + + ul li:before { + content: '– '; + position: absolute; + margin-left: -1.1em; + } + section.step { page-break-before: always; text-align: center; @@ -114,6 +155,10 @@ padding: 0; } + section.step.first { + page-break-before: right; + } + section.step_content { overflow: hidden; position: absolute; @@ -150,6 +195,11 @@ padding: 0; } + .traces li:before, + .options li:before { + content: ''; + } + .sentence { z-index: 1; position: absolute; @@ -162,7 +212,6 @@ /* background: white; */ } - .tree { z-index: 1; position: absolute; @@ -182,6 +231,10 @@ /* font-style: italic; */ } + .last-word-previous-step [data-picked] { + text-decoration: none; + } + [data-picked]::after { content: ' → '; text-decoration: none; @@ -195,6 +248,90 @@ .traces> :last-child [data-picked]::after { display: none; } + + .chapter { + page-break-before: right; + } + + .chapter#chapter-introduction, + .chapter#chapter-description, + .chapter#chapter-technical-description, + .chapter#chapter-credits { + width: 65%; + } + + .pagedjs_right_page .chapter#chapter-introduction, + .pagedjs_right_page .chapter#chapter-description, + .pagedjs_right_page .chapter#chapter-technical-description, + .pagedjs_right_page .chapter#chapter-credits { + margin-left: 30%; + } + + .pagedjs_right_page .sources { + float: left; + margin-left: calc(-1 * (40% + 3em)); + width: 40%; + } + + + 
.pagedjs_left_page .sources { + float: left; + margin-right: calc(-1 * (100% + 3em)); + width: 40%; + } + + .sources a { + word-break: break-all; + } + + a { + text-decoration: underline dotted; + color: currentColor; + } + + pre, code, table { + font-family: "Fira mono"; + font-size: .8rem; + } + + th { + font-weight: normal; + } + + h4 { + font-size: 1rem; + } + + h4:first-child { + margin-top: 0; + } + + + .pagedjs_right_page p.images { + margin-left: -25%; + } + + .pagedjs_left_page p.images { + margin-right: -25%; + } + + img { + display: inline-block; + max-width: 45%; + vertical-align: top; + } + + .complete_sentence { + height: 282mm; + text-align: center; + display: flex; + flex-direction: column; + justify-content: center; + page-break-before: right; + page-break-after: left; + padding-left: 15mm; + padding-right: 15mm; + } {% if DEBUG %} @@ -205,7 +342,47 @@

Paseo por los árboles de Madrid con {{ author }} y {{ title }}

-
+
+

Paseo por los árboles de Madrid con {{ author }} y {{ title }}

+ {% for sentence, previous_steps, tree, traces, last_word_previous_step in path %} +
+
+ {{ previous_steps }} +
+
+
    + {% if last_word_previous_step %} +
  • +
      +
    • {{ last_word_previous_step}}
    • +
    +
  • + {% endif %} + {% for word, dice, options in traces %} +
  • +
      + {% for option in options %} +
    • + {{ option }} +
    • + {% endfor %} +
    + +
  • + {% endfor %} +
+
+
+ {{ tree.properties.NOMBRE_COMUN }} en {{ tree.properties.MINTDIRECCIONAUX }} +
+
+ {% endfor %} +
+ {{ complete_sentence }} +
+
+ +

Introducción

Paseo por los árboles de Madrid es un libro en la Editorial Algoliteraria: crear alianzas con los árboles.
El autor de este libro es el algoritmo de las cadenas de Markov. Genera simultáneamente @@ -244,45 +421,26 @@

- {% for sentence, previous_steps, tree, traces in path %} -
-

Poema & Paseo

-
- {{ previous_steps }} -
-
-
    - {% for word, dice, options in traces %} -
  • -
      - {% for option in options %} -
    • - {{ option }} -
    • - {% endfor %} -
    - +
    +

    Descripción general de las cadenas de Markov

    +
    +

    Fuentes

    +
    -
    - {{ tree.properties.NOMBRE_COMUN }} en {{ tree.properties.MINTDIRECCIONAUX }} -
    -
    - {% endfor %} - -
    -

    Descripción general de las cadenas de Markov

    -

    Fuentes

    -

    https://spectrum.ieee.org/andrey-markov-and-claude-shannon-built-the-first-language-generation-models - http://langvillea.people.cofc.edu/MCapps7.pdf - https://www.irishtimes.com/news/science/that-s-maths-andrey-markov-s-brilliant-ideas-are-still-bearing-fruit-1.3220929 - http://www.alpha60.de/research/markov/DavidLink_TracesOfTheMouth_2006.pdf -

    Historias

    Andrey Andreyevich Markov fue un matemático ruso que vivió entre 1856 y 1922. Sus estudios más famosos fueron con las cadenas de Markov, un algoritmo que permite predecir los cambios futuros una vez que se conoce el @@ -328,9 +486,10 @@ Aunque Markov hubiera tenido más tiempo y mejor vista para llevar a cabo sus experimentos, las extensiones habrían sido muy difíciles de completar, dada la época preinformática en la que vivió, en la que los esfuerzos computacionales debían pagarse en años-hombre.

    -

    -
    These images show Markov’s original notes in computing the probabilities - needed for his Pushkin chain. +

    +
    + Estas imágenes muestran las notas originales de Markov al calcular las probabilidades necesarias para su + cadena Pushkin.

    Influencia

    Algunos de los conceptos centrales de Markov en torno a la probabilidad y el lenguaje se extendieron por el @@ -402,16 +561,25 @@ También aquí las cadenas de Markov han asumido gran parte del trabajo.

    -
    +

    Descripción técnica de las cadenas de Markov

    -

    Fuentes:

    -

    https://en.wikipedia.org/wiki/Examples_of_Markov_chains - https://higherkindedtripe.wordpress.com/2012/02/26/markov-chains-or-daddy-where-does-spam-come-from/ - https://towardsdatascience.com/predicting-the-weather-with-markov-chains-a34735f0c4df -

    +
    +

    Fuentes:

    + +

    En un proceso de Markov podemos predecir los cambios futuros una vez que conocemos el estado actual. Wikipedia describe muy bien la diferencia entre las cadenas de Markov y otros sistemas: "Un juego de serpientes y escaleras o cualquier otro juego cuyas jugadas se determinan enteramente por los dados es una @@ -751,12 +919,15 @@

    -
    -

    Código

    - +
    +

    Código

    + {% for path, source in sources %} +

    {{ path }}

    +
    {{ source }}
    + {% endfor %}
    -
    +

    Créditos

    Este libro es una creación de Anaïs Berck para Medialab como parte del programa "Residencia Cultura Digital" iniciado por el Gobierno Flamenco.
    En esta obra Anaïs Berck está representadx por:

    diff --git a/scripts/templates/book_en.html b/scripts/templates/book_en.html index 906832a..399826f 100644 --- a/scripts/templates/book_en.html +++ b/scripts/templates/book_en.html @@ -9,50 +9,75 @@ {% if DEBUG %} @@ -203,375 +340,578 @@ -

    Walk along the trees of Madrid con {{ author }} y {{ title }}

    +

    Walk along the trees of Madrid with {{ author }} and {{ title }}

    + + +
    +

    Walk along the trees of Madrid with {{ author }} and {{ title }}

    + {% for sentence, previous_steps, tree, traces, last_word_previous_step in path %} +
    +
    + {{ previous_steps }} +
    +
    +
      + {% if last_word_previous_step %} +
    • +
        +
      • {{ last_word_previous_step}}
      • +
      +
    • + {% endif %} + {% for word, dice, options in traces %} +
    • +
        + {% for option in options %} +
      • + {{ option }} +
      • + {% endfor %} +
      + +
    • + {% endfor %} +
    +
    +
    + {{ tree.properties.NOMBRE_COMUN }} on {{ tree.properties.MINTDIRECCIONAUX }} +
    +
    + {% endfor %} +
    + {{ complete_sentence }} +
    +
    -
    + +

    Introduction

    -

    Walk along the trees of Madrid is a book in the An Algoliterary Publishing House: making kin with trees.
    The author of this book is the Markov chains algorithm. It simultaneously generates a poem and a walk along the trees of the neighbourhood Las Letras in the centre of Madrid.
    The poem is created from a novel chosen by the reader. The reader has the choice between two novels by great Spanish writers of the 19th century:

    -
      -
    • The Swan of Vila Morta by the feminist writer Emilia Pardo Bazán published in 1891.
    • -
    • Marianela by the writer Benito Pérez Galdós, published in 1878.
    • -
    -

    The walk is generated from the database with trees in Madrid, Un Alcorque, un Árbol. Each significant word - noun, adjective, verb or adverb - is related to a tree in Madrid's neighbourhood las Letras. The other words create the path between the different trees. Thus one can walk through the neighbourhood reciting parts of the poem to each tree along the promenade.
    This book is by definition infinite and unique.
    It is created by Anaïs Berck. It is a pseudonym that represents a collaboration between humans, algorithms and trees. Anaïs Berck explores the specificities of human intelligence in the company of artificial and plant intelligences.
    An Algoliterary Publishing is a collection of publications in which algorithms are the authors of unusual books. This book was created as part of a residency at the center for contemporary arts Medialab Prado in Madrid. The residency was granted by the programme "Residency Digital Culture" initiated by the Flemish Government.

    -

    In this work Anaïs Berck is represented by:

    -
      -
    • the Markov chains algorithm, of which a description is given in this book,
    • -
    • the trees of Madrid, which are geolocated between Medialab Prado, Plaza del Sol and Atocha Renfe, and present in the database Un Alcorque, un Árbol,
    • -
    • the human beings Emilia Pardo Bazán, Benito Pérez Gáldos, Jaime Munárriz, Luis Morell, An Mertens, Eva Marina Gracia, Gijs de Heij, Ana Isabel Garrido Mártinez, Alfredo Calosci, Daniel Arribas Hedo.
    • -
    +

    Walk along the trees of Madrid is a book in the An Algoliterary Publishing House: making kin with + trees. +
    The author of this book is the Markov chains algorithm. It simultaneously generates a poem and a + walk along the trees of the neighbourhood Las Letras in the centre of Madrid.
    The poem is created from a novel + chosen by the reader. The reader has the choice between two novels by great Spanish writers of the 19th century: +

    +
      +
    • The Swan of Vila Morta by the feminist writer Emilia Pardo Bazán published in 1891.
    • +
    • Marianela by the writer Benito Pérez Galdós, published in 1878.
    • +
    +

    The walk is generated from the database with trees in Madrid, Un Alcorque, un Árbol. Each significant word - + noun, adjective, verb or adverb - is related to a tree in Madrid's neighbourhood las Letras. The other words + create the path between the different trees. Thus one can walk through the neighbourhood reciting parts of the + poem to each tree along the promenade.
    This book is by definition infinite and unique.
    It is created by + Anaïs Berck. It is a pseudonym that represents a collaboration between humans, algorithms and trees. Anaïs Berck + explores the specificities of human intelligence in the company of artificial and plant intelligences.
    An + Algoliterary Publishing is a collection of publications in which algorithms are the authors of unusual books. This + book was created as part of a residency at the center for contemporary arts Medialab Prado in Madrid. The + residency was granted by the programme "Residency Digital Culture" initiated by the Flemish Government. +

    +

    In this work Anaïs Berck is represented by:

    +
      +
    • the Markov chains algorithm, of which a description is given in this book,
    • +
    • the trees of Madrid, which are geolocated between Medialab Prado, Plaza del Sol and Atocha Renfe, and present + in the database Un Alcorque, un Árbol,
    • +
    • the human beings Emilia Pardo Bazán, Benito Pérez Gáldos, Jaime Munárriz, Luis Morell, An Mertens, Eva Marina + Gracia, Gijs de Heij, Ana Isabel Garrido Mártinez, Alfredo Calosci, Daniel Arribas Hedo.
    • +
    - {% for sentence, previous_steps, tree, traces in path %} -
    -

    Poem & Walk

    -
    - {{ previous_steps }} -
    -
    -
      - {% for word, dice, options in traces %} -
    • -
        - {% for option in options %} -
      • - {{ option }} -
      • - {% endfor %} -
      - +
      +

      General description of the Markov Chains

      +
      +

      Sources

      +
      -
      - {{ tree.properties.NOMBRE_COMUN }} en {{ tree.properties.MINTDIRECCIONAUX }} -
      +

      Histories

      +

      Andrey Andreyevich Markov was a Russian mathematician who lived between 1856 and 1922. His most famous studies + were with Markov chains, an algorithm that allows to predict future changes once one knows the current state . The + first paper on the subject was published in 1906. He was also interested in literature. He tried establishing a + linguistic mathematical model using Markov Chains by manually counting letters of Alexander Pusjkins verse novel + ​Eugene Onegin​. Next, he applied the method to the novel Childhood Years of Bagrov's Grandson by S.T. + Aksakov. This links the Markov Chains directly to the field of literature, text and language. And the link will + live firmly throughout the history of this algorithm.
      The following text is based on Oscar Schwartz' + article for IEEE Spectrum, Andrey + Markov & Claude Shannon Counted Letters to Build the First Language-Generation Models.
      In 1913, + Andrey Markov sat down in his study in St. Petersburg with a copy of Alexander Pushkin’s 19th century verse novel, + Eugene Onegin, a literary classic at the time. This work + comprises almost 400 stanzas of iambic tetrameter. Markov, however, did not start reading Pushkin’s famous text. + Rather, he took a pen and piece of drafting paper, and wrote out the first 20,000 letters of the book in one long + string of letters, eliminating all punctuation and spaces. Then he arranged these letters in 200 grids (10-by-10 + characters each) and began counting the vowels in every row and column, tallying the results.
      In separating the + vowels from the consonants, Markov was testing a theory of probability that he had developed in 1906 and that we + now call a Markov Process or Markov Chain. Up until that point, the field of probability had been mostly limited + to analyzing phenomena like roulette or coin flipping, where the outcome of previous events does not change the + probability of current events. But Markov felt that most things happen in chains of causality and are dependent on + prior outcomes. He wanted a way of modeling these occurrences through probabilistic analysis.
      Language, Markov + believed, was an example of a system where past occurrences partly determine present outcomes. To demonstrate + this, he wanted to show that in a text like Pushkin’s novel, the chance of a certain letter appearing at some + point in the text is dependent, to some extent, on the letter that came before it.

      +

      To do so, Markov began counting vowels in Eugene Onegin, and found that 43 percent of letters were vowels and 57 + percent were consonants. Then Markov separated the 20,000 letters into pairs of vowels and consonant combinations. + He found that there were 1,104 vowel-vowel pairs, 3,827 consonant-consonant pairs, and 15,069 vowel-consonant and + consonant-vowel pairs. What this demonstrated, statistically speaking, was that for any given letter in Pushkin’s + text, if it was a vowel, odds were that the next letter would be a consonant, and vice versa.

      +

      Markov used this analysis to demonstrate that Pushkin’s Eugene Onegin wasn’t just a random distribution of + letters but had some underlying statistical qualities that could be modeled. The enigmatic research paper that + came out of this study, entitled An + Example of Statistical Investigation of the Text Eugene Onegin Concerning the Connection of Samples in + Chains was not widely cited in Markov’s lifetime, and not translated to English until 2006. Markov was + forced to stop his letter-counting experiments, when he had nearly completely lost his sight due to glaucoma. Even + if Markov had had more time and better eyesight to carry his experiments further, extensions would have been very + difficult to complete, given the precomputer era he lived in, when computational efforts had to be paid in + man-years.

      +

      +
      These images show Markov’s original notes in computing the probabilities + needed for his Pushkin chain. +

      +

      Influence

      +

      Some of Markov's central concepts around probability and language spread across the globe, eventually finding + re-articulation in Claude Shannon’s hugely influential paper, A Mathematical Theory + of Communication which came out in 1948.
      Shannon’s paper outlined a way to precisely measure the quantity + of information in a message, and in doing so, set the foundations for a theory of information that would come to + define the digital age. Shannon was fascinated by Markov’s idea that in a given text, the likelihood of some + letter or word appearing could be approximated. Like Markov, Shannon demonstrated this by performing some textual + experiments that involved making a statistical model of language, then took a step further by trying to use the + model to generate text according to those statistical rules.
      In an initial control experiment, he started by + generating a sentence by picking letters randomly from a 27-symbol alphabet (26 letters, plus a space), and got + the following output:

      +

      XFOML RXKHRJFFJUJ ZLPWCFWKCYJ FFJEYVKCQSGHYD QPAAMKBZAACIBZLHJQD

      +

      The sentence was meaningless noise, Shannon said, because when we communicate we don’t choose letters with equal + probability. As Markov had shown, consonants are more likely than vowels. But at a greater level of granularity, + E’s are more common than S’s which are more common than Q’s. To account for this, Shannon amended his original + alphabet so that it modeled the probability of English more closely—he was 11 percent more likely to draw an E + from the alphabet than a Q. When he again drew letters at random from this recalibrated corpus he got a sentence + that came a bit closer to English.

      +

      OCRO HLI RGWR NMIELWIS EU LL NBNESEBYA TH EEI ALHENHTTPA OOBTTVA NAH BRL.

      +

      In a series of subsequent experiments, Shannon demonstrated that as you make the statistical model even more + complex, you get increasingly more comprehensible results. Shannon, via Markov, revealed a statistical framework + for the English language, and showed that by modeling this framework—by analyzing the dependent probabilities of + letters and words appearing in combination with each other—he could actually generate language.

      +

      The more complex the statistical model of a given text, the more accurate the language generation becomes—or as + Shannon put it, the greater “resemblance to ordinary English text.” In the final experiment, Shannon drew from a + corpus of words instead of letters and achieved the following:

      +

      THE HEAD AND IN FRONTAL ATTACK ON AN ENGLISH WRITER THAT THE CHARACTER OF THIS POINT IS THEREFORE ANOTHER METHOD + FOR THE LETTERS THAT THE TIME OF WHO EVER TOLD THE PROBLEM FOR AN UNEXPECTED.

      +

      For both Shannon and Markov, the insight that language’s statistical properties could be modeled offered a way to + re-think broader problems that they were working on. For Markov, it extended the study of stochasticity beyond + mutually independent events, paving the way for a new era in probability theory. For Shannon, it helped him + formulate a precise way of measuring and encoding units of information in a message, which revolutionized + telecommunications and, eventually, digital communication. But their statistical approach to language modeling and + generation also ushered in a new era for natural language processing, which has ramified through the digital age + to this day. As David Link notes in his article, Traces of the Mouth, Markov's efforts in retrospect + “represent an early and momentous attempt to understand the phenomenon of language in mathematical terms.” + It's not an exaggeration to say that Markov's analysis of text is in principle similar to what Google and + other firms now routinely carry out on a massive scale: analyzing words in books and internet documents, the order + in which the words occur, analyzing search phrases, detecting spam and so on.

      +

      Applications

      +

      Since Markov chains can be designed to model many real-world processes, they are used in a wide variety of + situations. They appear in physics and chemistry when probabilities are used for unknown quantities. In + information processing, they have a role in pattern recognition, automatic speech analysis and synthesis and data + compression. They are used by meteorologists, ecologists and biologists. Other applications include the control of + driverless cars, machine translation, queuing patterns, and prediction of population growth, asset prices, + currency exchange rates and market upheavals. Also artists have used Markov chains, such as musician Iannis + Xenakis who developed “Free Stochastic Music” based on Markov chains.

      +

In 2006 – the 100th anniversary of Markov's paper – Philipp Von Hilgers and Amy Langville summarized the five greatest applications of Markov chains. This + includes the one that is used by most of us on a daily basis, Google's Page Rank. Every time we search on the + internet, the ranking of webpages is based on the solution to a massive Markov chain. You can say that all the web + pages are states, and the links between them are transitions possessing specific probabilities. In other words, we + can say that no matter what you’re searching on Google, there’s a finite probability of you ending up on a + particular web page. If you use Gmail, you must’ve noticed their Auto-fill feature. This feature automatically + predicts your sentences to help you write emails quickly.
      And last but not least, have you ever wondered why + spam has all those hilarious nonsensical strings of words in it? They’re pretty odd constructions, not as random + as if you picked words randomly out of a hat, almost grammatical much of the time, but still clearly gibberish. + Also here the Markov chains have taken on a lot of the work.

      - {% endfor %} -
      -

      General description of the Markov Chains

      -

      Sources

      -

      https://spectrum.ieee.org/andrey-markov-and-claude-shannon-built-the-first-language-generation-models - http://langvillea.people.cofc.edu/MCapps7.pdf - https://www.irishtimes.com/news/science/that-s-maths-andrey-markov-s-brilliant-ideas-are-still-bearing-fruit-1.3220929 - http://www.alpha60.de/research/markov/DavidLink_TracesOfTheMouth_2006.pdf -

      -

      Histories

      -

Andrey Andreyevich Markov was a Russian mathematician who lived between 1856 and 1922. His most famous studies were with Markov chains, an algorithm that allows one to predict future changes once one knows the current state. The first paper on the subject was published in 1906. He was also interested in literature. He tried establishing a linguistic mathematical model using Markov Chains by manually counting letters of Alexander Pushkin's verse novel Eugene Onegin. Next, he applied the method to the novel Childhood Years of Bagrov's Grandson by S.T. Aksakov. This links the Markov Chains directly to the field of literature, text and language. And the link will live firmly throughout the history of this algorithm.
      The following text is based on Oscar Schwartz' article for IEEE Spectrum, Andrey Markov & Claude Shannon Counted Letters to Build the First Language-Generation Models.
      In 1913, Andrey Markov sat down in his study in St. Petersburg with a copy of Alexander Pushkin’s 19th century verse novel, Eugene Onegin, a literary classic at the time. This work comprises almost 400 stanzas of iambic tetrameter. Markov, however, did not start reading Pushkin’s famous text. Rather, he took a pen and piece of drafting paper, and wrote out the first 20,000 letters of the book in one long string of letters, eliminating all punctuation and spaces. Then he arranged these letters in 200 grids (10-by-10 characters each) and began counting the vowels in every row and column, tallying the results.
      In separating the vowels from the consonants, Markov was testing a theory of probability that he had developed in 1906 and that we now call a Markov Process or Markov Chain. Up until that point, the field of probability had been mostly limited to analyzing phenomena like roulette or coin flipping, where the outcome of previous events does not change the probability of current events. But Markov felt that most things happen in chains of causality and are dependent on prior outcomes. He wanted a way of modeling these occurrences through probabilistic analysis.
      Language, Markov believed, was an example of a system where past occurrences partly determine present outcomes. To demonstrate this, he wanted to show that in a text like Pushkin’s novel, the chance of a certain letter appearing at some point in the text is dependent, to some extent, on the letter that came before it.

      -

      To do so, Markov began counting vowels in Eugene Onegin, and found that 43 percent of letters were vowels and 57 percent were consonants. Then Markov separated the 20,000 letters into pairs of vowels and consonant combinations. He found that there were 1,104 vowel-vowel pairs, 3,827 consonant-consonant pairs, and 15,069 vowel-consonant and consonant-vowel pairs. What this demonstrated, statistically speaking, was that for any given letter in Pushkin’s text, if it was a vowel, odds were that the next letter would be a consonant, and vice versa.

      -

      Markov used this analysis to demonstrate that Pushkin’s Eugene Onegin wasn’t just a random distribution of letters but had some underlying statistical qualities that could be modeled. The enigmatic research paper that came out of this study, entitled An Example of Statistical Investigation of the Text Eugene Onegin Concerning the Connection of Samples in Chains was not widely cited in Markov’s lifetime, and not translated to English until 2006. Markov was forced to stop his letter-counting experiments, when he had nearly completely lost his sight due to glaucoma. Even if Markov had had more time and better eyesight to carry his experiments further, extensions would have been very difficult to complete, given the precomputer era he lived in, when computational efforts had to be paid in man-years.

      -

      -
      These images show Markov’s original notes in computing the probabilities needed for his Pushkin chain.

      -

      Influence

      -

      Some of Markov's central concepts around probability and language spread across the globe, eventually finding re-articulation in Claude Shannon’s hugely influential paper, A Mathematical Theory of Communication which came out in 1948.
      Shannon’s paper outlined a way to precisely measure the quantity of information in a message, and in doing so, set the foundations for a theory of information that would come to define the digital age. Shannon was fascinated by Markov’s idea that in a given text, the likelihood of some letter or word appearing could be approximated. Like Markov, Shannon demonstrated this by performing some textual experiments that involved making a statistical model of language, then took a step further by trying to use the model to generate text according to those statistical rules.
      In an initial control experiment, he started by generating a sentence by picking letters randomly from a 27-symbol alphabet (26 letters, plus a space), and got the following output:

      -

      XFOML RXKHRJFFJUJ ZLPWCFWKCYJ FFJEYVKCQSGHYD QPAAMKBZAACIBZLHJQD

      -

      The sentence was meaningless noise, Shannon said, because when we communicate we don’t choose letters with equal probability. As Markov had shown, consonants are more likely than vowels. But at a greater level of granularity, E’s are more common than S’s which are more common than Q’s. To account for this, Shannon amended his original alphabet so that it modeled the probability of English more closely—he was 11 percent more likely to draw an E from the alphabet than a Q. When he again drew letters at random from this recalibrated corpus he got a sentence that came a bit closer to English.

      -

      OCRO HLI RGWR NMIELWIS EU LL NBNESEBYA TH EEI ALHENHTTPA OOBTTVA NAH BRL.

      -

      In a series of subsequent experiments, Shannon demonstrated that as you make the statistical model even more complex, you get increasingly more comprehensible results. Shannon, via Markov, revealed a statistical framework for the English language, and showed that by modeling this framework—by analyzing the dependent probabilities of letters and words appearing in combination with each other—he could actually generate language.

      -

      The more complex the statistical model of a given text, the more accurate the language generation becomes—or as Shannon put it, the greater “resemblance to ordinary English text.” In the final experiment, Shannon drew from a corpus of words instead of letters and achieved the following:

      -

      THE HEAD AND IN FRONTAL ATTACK ON AN ENGLISH WRITER THAT THE CHARACTER OF THIS POINT IS THEREFORE ANOTHER METHOD FOR THE LETTERS THAT THE TIME OF WHO EVER TOLD THE PROBLEM FOR AN UNEXPECTED.

      -

      For both Shannon and Markov, the insight that language’s statistical properties could be modeled offered a way to re-think broader problems that they were working on. For Markov, it extended the study of stochasticity beyond mutually independent events, paving the way for a new era in probability theory. For Shannon, it helped him formulate a precise way of measuring and encoding units of information in a message, which revolutionized telecommunications and, eventually, digital communication. But their statistical approach to language modeling and generation also ushered in a new era for natural language processing, which has ramified through the digital age to this day. As David Link notes in his article, Traces of the Mouth, Markov's efforts in retrospect “represent an early and momentous attempt to understand the phenomenon of language in mathematical terms.” It's not an exaggeration to say that Markov's analysis of text is in principle similar to what Google and other firms now routinely carry out on a massive scale: analyzing words in books and internet documents, the order in which the words occur, analyzing search phrases, detecting spam and so on.

      -

      Applications

      -

      Since Markov chains can be designed to model many real-world processes, they are used in a wide variety of situations. They appear in physics and chemistry when probabilities are used for unknown quantities. In information processing, they have a role in pattern recognition, automatic speech analysis and synthesis and data compression. They are used by meteorologists, ecologists and biologists. Other applications include the control of driverless cars, machine translation, queuing patterns, and prediction of population growth, asset prices, currency exchange rates and market upheavals. Also artists have used Markov chains, such as musician Iannis Xenakis who developed “Free Stochastic Music” based on Markov chains.

      -

In 2006 – the 100th anniversary of Markov's paper – Philipp Von Hilgers and Amy Langville summarized the five greatest applications of Markov chains. This includes the one that is used by most of us on a daily basis, Google's Page Rank. Every time we search on the internet, the ranking of webpages is based on the solution to a massive Markov chain. You can say that all the web pages are states, and the links between them are transitions possessing specific probabilities. In other words, we can say that no matter what you’re searching on Google, there’s a finite probability of you ending up on a particular web page. If you use Gmail, you must’ve noticed their Auto-fill feature. This feature automatically predicts your sentences to help you write emails quickly.
      And last but not least, have you ever wondered why spam has all those hilarious nonsensical strings of words in it? They’re pretty odd constructions, not as random as if you picked words randomly out of a hat, almost grammatical much of the time, but still clearly gibberish. Also here the Markov chains have taken on a lot of the work.

      -
      - -
      +

      Technical description of the Markov Chain

      -

      Sources

      -

      https://en.wikipedia.org/wiki/Examples_of_Markov_chains - +

      Sources

      + + + +

      In a Markov process we can predict future changes once we know the current state. Wikipedia gives a very good + description of the difference between Markov chains and other systems: 'A game of snakes and ladders or any + other game whose moves are determined entirely by dice is a Markov chain, indeed, an absorbing Markov chain. This + is in contrast to card games such as blackjack, where the cards represent a 'memory' of the past moves. To + see the difference, consider the probability for a certain event in the game. In the above-mentioned dice games, + the only thing that matters is the current state of the board. The next state of the board depends on the current + state, and the next roll of the dice. It doesn't depend on how things got to their current state. In a game + such as blackjack, a player can gain an advantage by remembering which cards have already been shown (and hence + which cards are no longer in the deck), so the next state (or hand) of the game is not independent of the past + states.'
      So, for a Markov process, only the current state determines the next state; the history of the + system has no impact. For that reason we describe a Markov process as memoryless. What happens next is determined + completely by the current state and the transition probabilities.

      +

      In what follows, we describe a classic working of the Markov chains, next to a simplified version we used to + develop a Markov game and the code for this book.

      +

      Classic version

      +

      This example is taken from the following source: https://higherkindedtripe.wordpress.com/2012/02/26/markov-chains-or-daddy-where-does-spam-come-from/ - https://towardsdatascience.com/predicting-the-weather-with-markov-chains-a34735f0c4df

      -

      In a Markov process we can predict future changes once we know the current state. Wikipedia gives a very good description of the difference between Markov chains and other systems: 'A game of snakes and ladders or any other game whose moves are determined entirely by dice is a Markov chain, indeed, an absorbing Markov chain. This is in contrast to card games such as blackjack, where the cards represent a 'memory' of the past moves. To see the difference, consider the probability for a certain event in the game. In the above-mentioned dice games, the only thing that matters is the current state of the board. The next state of the board depends on the current state, and the next roll of the dice. It doesn't depend on how things got to their current state. In a game such as blackjack, a player can gain an advantage by remembering which cards have already been shown (and hence which cards are no longer in the deck), so the next state (or hand) of the game is not independent of the past states.'
      So, for a Markov process, only the current state determines the next state; the history of the system has no impact. For that reason we describe a Markov process as memoryless. What happens next is determined completely by the current state and the transition probabilities.

      -

      In what follows, we describe a classic working of the Markov chains, next to a simplified version we used to develop a Markov game and the code for this book.

      -

      Classic version

      -

      This example is taken from the following source: https://higherkindedtripe.wordpress.com/2012/02/26/markov-chains-or-daddy-where-does-spam-come-from/

      -

      You take a piece of “training” text.
      You make a list of all the words in it.
      For each word, make a list of all the other words that come after it, with the number of times each word appears. So with the sentence: “the quick brown fox jumped over the lazy dog”, you would end up with the list:

      +

      You take a piece of “training” text.
      You make a list of all the words in it.
      For each word, make a list of + all the other words that come after it, with the number of times each word appears. So with the sentence: “the + quick brown fox jumped over the lazy dog”, you would end up with the list:

        -
      1. the -> (1, quick), (1, lazy)
      2. -
      3. quick -> (1, brown)
      4. -
      5. brown -> (1, fox)
      6. -
      7. fox -> (1, jumped)
      8. -
      9. jumped -> (1, over)
      10. -
      11. over -> (1, the)
      12. -
      13. lazy -> (1, dog)
      14. -
      15. dog ->
      16. +
      17. the → (1, quick), (1, lazy)
      18. +
      19. quick → (1, brown)
      20. +
      21. brown → (1, fox)
      22. +
      23. fox → (1, jumped)
      24. +
      25. jumped → (1, over)
      26. +
      27. over → (1, the)
      28. +
      29. lazy → (1, dog)
      30. +
      31. dog →
      -

      Turn the list into a matrix, where the rows represent the “leading” words and the columns represent “following” words, and each number in the matrix says how many times the following word appeared after the leading word. You will get:

      +

      Turn the list into a matrix, where the rows represent the “leading” words and the columns represent “following” + words, and each number in the matrix says how many times the following word appeared after the leading word. You + will get:

      - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      thequickbrownfoxjumpedoverlazydog
      the01000010
      quick00100000
      brown00010000
      fox00001000
      jumped00000100
      over00000010
      lazy00000001
      dog00000000
      thequickbrownfoxjumpedoverlazydog
      the01000010
      quick00100000
      brown00010000
      fox00001000
      jumped00000100
      over00000010
      lazy00000001
      dog00000000
      -

      Divide every number in the matrix by the total of its row, and you’ll notice that each row becomes a sort of probability distribution.

      +

      Divide every number in the matrix by the total of its row, and you’ll notice that each row becomes a sort of + probability distribution.

      - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      thequickbrownfoxjumpedoverlazydog
      the00.500000.50
      quick00100000
      brown00010000
      fox00001000
      jumped00000100
      over00000010
      lazy00000001
      dog00000000
      thequickbrownfoxjumpedoverlazydog
      the00.500000.50
      quick00100000
      brown00010000
      fox00001000
      jumped00000100
      over00000010
      lazy00000001
      dog00000000
      -

      You can interpret this as saying “if the first word is a ‘the’ there’s a 50% chance the next word is ‘quick’, and a 50% chance the next word is ‘lazy’. For all the other words, there is only one possible word following it.”
      Almost every word has only one possible following word because the text is so short. But, if you train it with a larger text, and interpret the rows as a probability distribution, you can start to see for every word what sort of word tends to follow it. This gives a very interesting insight into the nature of written text.
      If you take that big “transition matrix” you’ve trained from a large text, you can use it to actually generate new text in the following way:

      +

      You can interpret this as saying “if the first word is a ‘the’ there’s a 50% chance the next word is ‘quick’, and + a 50% chance the next word is ‘lazy’. For all the other words, there is only one possible word following + it.”
      Almost every word has only one possible following word because the text is so short. But, if you train it + with a larger text, and interpret the rows as a probability distribution, you can start to see for every word what + sort of word tends to follow it. This gives a very interesting insight into the nature of written text.
      If you + take that big “transition matrix” you’ve trained from a large text, you can use it to actually generate new text + in the following way:

        -
      1. Pick a “seed” word from the text at random. For best results use one with many possible following words.

        -
      2. -
      3. Find the row in the matrix corresponding to that word. Choose the next word at random, weighted according to the probabilities in the row. That is, if the column corresponding to the word “blue” has the number .05 in it, you have a 5% chance of picking “blue” as the next word, and so on (when we divided each number by the total of its row we made sure that these probabilities would add up to 1).

        -
      4. -
5. Go back to step 2 using this second word as the new “seed” word. Continue this process to generate as long a string of words as you want. If you end up with a word for which no other words follow it (uncommon when you train on a large text, but possible – imagine if the last word of a novel was the only occurrence of the word “xylophone”, or whatever), just pick a random word.

        -
      6. +
      7. +

        Pick a “seed” word from the text at random. For best results use one with many possible following words.

        +
      8. +
      9. +

        Find the row in the matrix corresponding to that word. Choose the next word at random, weighted according to + the probabilities in the row. That is, if the column corresponding to the word “blue” has the number .05 in + it, you have a 5% chance of picking “blue” as the next word, and so on (when we divided each number by the + total of its row we made sure that these probabilities would add up to 1).

        +
      10. +
      11. +

Go back to step 2 using this second word as the new “seed” word. Continue this process to generate as long a + string of words as you want. If you end up with a word for which no other words follow it (uncommon when you + train on a large text, but possible – imagine if the last word of a novel was the only occurrence of the word + “xylophone”, or whatever), just pick a random word.

        +
      -

      You can see how strings of words generated with this method will follow the “trends” of the training data, meaning that if you were to generate a new transition matrix from the generated words it would, on average, look the same as the original transition matrix since you picked the words according to those weights. This completely mechanical process can generate data which looks, statistically, like meaningful English. Of course, it is not necessarily grammatical, and is certainly devoid of higher meaning since it was generated through this simplistic process.

      -

      Those “chains” of words constructed by the above process are an example of Markov chains. And they are also the answer to the question “where does spam come from?”. Those uncannily-almost-grammatical ramblings below the “Viagra” ads, generated through the above process, are the spam-creators way of fooling your spam filter. They include these chains to give their advertisements statistical similarity to meaningful human correspondence. This works because the spam filters are (at least in part) using probabilistic models that depend on word-transitions and word frequencies to classify incoming email as spam. The spammers and the filter-writers are engaged in an eternal game of randomly-generated cat-and-mouse.

      +

      You can see how strings of words generated with this method will follow the “trends” of the training data, + meaning that if you were to generate a new transition matrix from the generated words it would, on average, look + the same as the original transition matrix since you picked the words according to those weights. This completely + mechanical process can generate data which looks, statistically, like meaningful English. Of course, it is not + necessarily grammatical, and is certainly devoid of higher meaning since it was generated through this simplistic + process.

      +

      Those “chains” of words constructed by the above process are an example of Markov chains. And they are also the + answer to the question “where does spam come from?”. Those uncannily-almost-grammatical ramblings below the + “Viagra” ads, generated through the above process, are the spam-creators way of fooling your spam filter. They + include these chains to give their advertisements statistical similarity to meaningful human correspondence. This + works because the spam filters are (at least in part) using probabilistic models that depend on word-transitions + and word frequencies to classify incoming email as spam. The spammers and the filter-writers are engaged in an + eternal game of randomly-generated cat-and-mouse.

      Simplified version

      -

      With Algolit, an artistic research group on libre code and text based in Brussels, we developed a Markov Chain game with sentences and cards. This happened as part of the festival Désert Numérique, in La Drôme in France in 2014. The game was developed by Brendan Howell, Catherine Lenoble and An Mertens. You can listen back to the radio show: http://desert.numerique.free.fr//archives/?id=1011&ln=fr.
      Next, the game was presented at Transmediale in Berlin in 2015, respecting the following rules.

      -
        -
      1. We take a text, for example:

        -
        -

Cqrrelations read as poetry to statisticians. Can statisticians read poetry with machines? Cqrrelations is a practise for artists, for datatravellers, statisticians and other lovers of machines to explore a world of blurry categorisations and crummylations. Machines correlate to dissidents, dissidents correlate to statisticians.

        -
        -
      2. -
      3. We create a database for this text; each word is an entry and takes the following word as a possible value. The entry for ‘Cqrrelations’ will have two values:

        +

        With Algolit, an artistic research group on libre code and text based in + Brussels, we developed a Markov Chain game with sentences and cards. This happened as part of the festival Désert + Numérique, in La Drôme in France in 2014. The game was developed by Brendan Howell, Catherine Lenoble and An + Mertens. You can listen back to the radio show: http://desert.numerique.free.fr//archives/?id=1011&ln=fr.
        Next, + the game was presented at Transmediale in Berlin in 2015, respecting the following rules.

          -
        1. read
        2. -
        3. is
        4. -
        -
      4. -
      5. Once the database is created, we choose a starting word for a new text, for ex. Cqrrelations.

        -
      6. -
      7. We roll the dice, odd numbers will give ‘read’ as the 2nd word of our text; even numbers will give ‘is’ as the 2nd word.
      8. -
      9. We roll the dice again, and choose a word amongst the values of the chosen word. This gives the next word of our sentence.
      10. -
      11. We continue 5 till we arrive at a word with a period (.)
      12. -
      13. We can repeat rule 3 till 6 until we are satisfied with the amount of generated sentences
      14. +
      15. +

        We take a text, for example:

        +
        +

        Cqrrelations read as poetry to statisticians. Can statisticians read poetry with machines?Cqrrelations is a + practise for artists, for datatravellers, statisticians and other lovers of machines to explore a world of + blurry categorisations and crummylations. Machines correlate to dissidents, dissidents correlate to + statisticians.

        +
        +
      16. +
      17. +

        We create a database for this text; each word is an entry and takes the following word as a possible value. + The entry for ‘Cqrrelations’ will have two values:

        +
          +
        1. read
        2. +
        3. is
        4. +
        +
      18. +
      19. +

        Once the database is created, we choose a starting word for a new text, for ex. Cqrrelations.

        +
      20. +
      21. We roll the dice, odd numbers will give ‘read’ as the 2nd word of our text; even numbers will give ‘is’ as the + 2nd word.
      22. +
      23. We roll the dice again, and choose a word amongst the values of the chosen word. This gives the next word of + our sentence.
      24. +
      25. We continue 5 till we arrive at a word with a period (.)
      26. +
      27. We can repeat rule 3 till 6 until we are satisfied with the amount of generated sentences

      Based on the input text the output at Transmediale was:

      -

      A world of blurry categorisations and other lovers of blurry categorisations and other lovers of blurry categorisations and other lovers of machines. Cqrrelations read poetry to dissidents correlate to machines. Lovers of machines to statisticians.

      +

      A world of blurry categorisations and other lovers of blurry categorisations and other lovers of blurry + categorisations and other lovers of machines. Cqrrelations read poetry to dissidents correlate to machines. + Lovers of machines to statisticians.

      -
      -

      Code of the book

      - +
      +

      Code of the book

      + {% for path, source in sources %} +

      {{ path }}

      +
      {{ source }}
      + {% endfor %}
      -
      +

      Credits

      -

This book is a creation of Anaïs Berck for Medialab as part of the programme "Residency Digital Culture" initiated by the Flemish Government. - In this work Anaïs Berck is represented by:

      +

This book is a creation of Anaïs Berck for Medialab as part of the programme "Residency Digital Culture" + initiated by the Flemish Government. + In this work Anaïs Berck is represented by:

        -
      • the Markov chains algorithm, of which a description is given in this book,
      • -
      • the trees of Madrid, which are geolocated between Medialab Prado, Plaza del Sol and Atocha Renfe, and present in the database Un Alcorque, un Árbol,
      • -
      • the human beings Emilia Pardo Bazán, Benito Pérez Gáldos, Jaime Munárriz, Luis Morell, An Mertens, Eva Marina Gracia, Gijs de Heij, Ana Isabel Garrido Mártinez, Alfredo Calosci, Daniel Arribas Hedo.
      • +
      • the Markov chains algorithm, of which a description is given in this book,
      • +
      • the trees of Madrid, which are geolocated between Medialab Prado, Plaza del Sol and Atocha Renfe, and present + in the database Un Alcorque, un Árbol,
      • +
      • the human beings Emilia Pardo Bazán, Benito Pérez Gáldos, Jaime Munárriz, Luis Morell, An Mertens, Eva Marina + Gracia, Gijs de Heij, Ana Isabel Garrido Mártinez, Alfredo Calosci, Daniel Arribas Hedo.

      The copy of this book is unique and the print run is by definition infinite.
- This copy is number {{ edition_count }} of the copies downloaded.

      -

      Collective terms of (re)use (CC4r), 2021
Copyleft with a difference: You are invited to copy, distribute, and modify this work under the terms of the CC4r.

      + This copy is the {{ edition_count }} number of copies downloaded.

      +

      Collective terms of (re)use (CC4r), 2021
      Copyleft with a difference: You are invited to copy, distribute, and + modify this work under the terms of the work under the terms of the CC4r.

      diff --git a/scripts/templates/index.html b/scripts/templates/index.html index a9128c6..32a92b4 100644 --- a/scripts/templates/index.html +++ b/scripts/templates/index.html @@ -68,6 +68,15 @@ font-size: 34pt; line-height: 45pt; margin-top: 0; + margin-bottom: 0; + } + + a { + color: currentColor; + } + + a:hover { + text-decoration: none; } p { @@ -157,7 +166,10 @@ -

      Paseo por los árboles de Madrid

      +
      +

      Paseo por los árboles de Madrid

      + en +

      En este libro, el algoritmo de las cadenas de Markov genera simultáneamente un poema y un paseo por los árboles del barrio de Las Letras, en el centro de Madrid. A pesar de la impresión de que hay pocos árboles en el barrio, el algoritmo cuenta con 460 de ellos.

      La cadena de Markov fue diseñada en 1906 por Andrey Markov, un matemático ruso fallecido en 1922. Este algoritmo está en la base de muchos programas informáticos que generan spam. Se utiliza para sistemas que describen una serie de eventos que son interdependientes. Lo que ocurre depende únicamente del paso anterior.

      diff --git a/scripts/templates/index_en.html b/scripts/templates/index_en.html index 3704538..7e38cdb 100644 --- a/scripts/templates/index_en.html +++ b/scripts/templates/index_en.html @@ -68,6 +68,15 @@ font-size: 34pt; line-height: 45pt; margin-top: 0; + margin-bottom: 0; + } + + a { + color: currentColor; + } + + a:hover { + text-decoration: none; } p { @@ -157,7 +166,10 @@ -

      Walk along the trees of Madrid

      +
      +

      Walk along the trees of Madrid

      + es +

      In this book, the Markov chain algorithm simultaneously generates a poem and a walk along the trees of the neighbourhood Las Letras in the centre of Madrid. Despite the impression that there are few trees in the neighbourhood, the algorithm counts 460 of them.

      The Markov chain was designed in 1906 by Andrey Markov, a Russian mathematician who died in 1922. This algorithm is at the basis of many computer programs that generate spam. It is used for systems that describe a series of events that are interdependent. What happens depends only on the previous step.