|
|
<!DOCTYPE html>
|
|
|
<html lang="en">
|
|
|
|
|
|
<head>
|
|
|
<meta charset="UTF-8">
|
|
|
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
|
<title>Document</title>
|
|
|
<style>
|
|
|
@font-face {
|
|
|
font-family: Serreria;
|
|
|
src: url("{{ STATIC_DIR }}/MFI-Serreria/MFI-Serreria-Extravagante.otf") format('opentype');
|
|
|
font-weight: normal;
|
|
|
font-style: normal;
|
|
|
}
|
|
|
|
|
|
@font-face {
|
|
|
font-family: PTSerif;
|
|
|
src: url("{{ STATIC_DIR }}/PT_Serif/PTSerif-Regular.ttf"f) format('truetype');
|
|
|
font-weight: normal;
|
|
|
font-style: normal;
|
|
|
}
|
|
|
|
|
|
@font-face {
|
|
|
font-family: PTSerif;
|
|
|
src: url("{{ STATIC_DIR }}/PT_Serif/PTSerif-Italic.ttf") format('truetype');
|
|
|
font-weight: normal;
|
|
|
font-style: italic;
|
|
|
}
|
|
|
|
|
|
@font-face {
|
|
|
font-family: "Fira mono";
|
|
|
src: url("{{ STATIC_DIR }}/fira-mono/FiraMono-Regular.ttf") format("truetype");
|
|
|
font-weight: 400;
|
|
|
font-style: normal;
|
|
|
}
|
|
|
|
|
|
@font-face {
|
|
|
font-family: "Fira mono";
|
|
|
src: url("{{ STATIC_DIR }}/fira-mono/FiraMono-Medium.ttf") format("truetype");
|
|
|
font-weight: 500;
|
|
|
font-style: normal;
|
|
|
}
|
|
|
|
|
|
@font-face {
|
|
|
font-family: "Fira mono";
|
|
|
src: url("{{ STATIC_DIR }}/fira-mono/FiraMono-Bold.ttf") format("truetype");
|
|
|
font-weight: 600;
|
|
|
font-style: normal;
|
|
|
}
|
|
|
|
|
|
@page {
|
|
|
size: 210mm 297mm;
|
|
|
margin: 10mm 20mm 15mm 20mm;
|
|
|
}
|
|
|
|
|
|
@page chain {
|
|
|
margin: 0mm 10mm 15mm 10mm;
|
|
|
}
|
|
|
|
|
|
@page title {
|
|
|
background: green;
|
|
|
margin: 10mm 10mm 15mm 10mm;
|
|
|
}
|
|
|
|
|
|
@page:left {
|
|
|
@bottom-left {
|
|
|
text-align: left;
|
|
|
content: counter(page);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@page:right {
|
|
|
@bottom-right {
|
|
|
text-align: right;
|
|
|
content: counter(page);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@page:empty {
|
|
|
@bottom-right {
|
|
|
content: '';
|
|
|
}
|
|
|
|
|
|
@bottom-left {
|
|
|
content: '';
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@page title {
|
|
|
@bottom-right {
|
|
|
content: '';
|
|
|
}
|
|
|
|
|
|
@bottom-left {
|
|
|
content: '';
|
|
|
}
|
|
|
}
|
|
|
|
|
|
#chapter-chain {
|
|
|
page: chain;
|
|
|
}
|
|
|
|
|
|
:root {
|
|
|
--font-size: 10pt;
|
|
|
--line-height: 15pt;
|
|
|
}
|
|
|
|
|
|
html,
|
|
|
body {
|
|
|
font-family: PTSerif;
|
|
|
font-size: var(--font-size);
|
|
|
line-height: var(--line-height);
|
|
|
}
|
|
|
|
|
|
h1 {
|
|
|
page: title;
|
|
|
color: white;
|
|
|
page-break-after: right;
|
|
|
font-family: Serreria;
|
|
|
font-weight: normal;
|
|
|
font-size: 24pt;
|
|
|
line-height: 30pt;
|
|
|
}
|
|
|
|
|
|
h1 em {
|
|
|
text-decoration: underline;
|
|
|
font-style: normal;
|
|
|
}
|
|
|
|
|
|
h2 {
|
|
|
font-family: Serreria;
|
|
|
font-size: 18pt;
|
|
|
line-height: 24pt;
|
|
|
}
|
|
|
|
|
|
ul {
|
|
|
margin: 0 0 0 1.1em;
|
|
|
padding: 0;
|
|
|
list-style-type: none;
|
|
|
}
|
|
|
|
|
|
ul li:before {
|
|
|
content: '– ';
|
|
|
position: absolute;
|
|
|
margin-left: -1.1em;
|
|
|
}
|
|
|
|
|
|
section.step {
|
|
|
page-break-before: always;
|
|
|
text-align: center;
|
|
|
height: 282mm;
|
|
|
/* position: relative; */
|
|
|
margin: 0;
|
|
|
padding: 0;
|
|
|
}
|
|
|
|
|
|
section.step.first {
|
|
|
page-break-before: right;
|
|
|
}
|
|
|
|
|
|
section.step_content {
|
|
|
overflow: hidden;
|
|
|
position: absolute;
|
|
|
top: 20mm;
|
|
|
left: 0;
|
|
|
right: 0;
|
|
|
bottom: 0mm;
|
|
|
}
|
|
|
|
|
|
.traces {
|
|
|
list-style-type: none;
|
|
|
display: flex;
|
|
|
flex-direction: row;
|
|
|
position: absolute;
|
|
|
top: 50%;
|
|
|
left: 0;
|
|
|
right: 0;
|
|
|
justify-content: center;
|
|
|
line-height: 1.2em;
|
|
|
margin: 0;
|
|
|
padding: 0;
|
|
|
vertical-align: text-bottom;
|
|
|
}
|
|
|
|
|
|
.options {
|
|
|
list-style-type: none;
|
|
|
margin: 0 4em 0 0;
|
|
|
}
|
|
|
|
|
|
.options li {
|
|
|
line-height: var(--line-height);
|
|
|
height: var(--line-height);
|
|
|
margin: 0;
|
|
|
padding: 0;
|
|
|
}
|
|
|
|
|
|
.traces li:before,
|
|
|
.options li:before {
|
|
|
content: '';
|
|
|
}
|
|
|
|
|
|
.sentence {
|
|
|
z-index: 1;
|
|
|
position: absolute;
|
|
|
top: 3mm;
|
|
|
padding: 0.5em 20mm 3em 20mm;
|
|
|
width: 100%;
|
|
|
box-sizing: border-box;
|
|
|
/* background: -webkit-linear-gradient(to top, rgba(255,255,255,0), white 2.5em); */
|
|
|
/* background: linear-gradient(to top, rgba(255,255,255,0), white 2.5em); */
|
|
|
/* background: white; */
|
|
|
}
|
|
|
|
|
|
.tree {
|
|
|
z-index: 1;
|
|
|
position: absolute;
|
|
|
padding: 0 20mm 0 20mm;
|
|
|
bottom: -10mm;
|
|
|
width: 100%;
|
|
|
box-sizing: border-box;
|
|
|
/* background: -webkit-linear-gradient(to top, rgba(255,255,255,0), white 2.5em); */
|
|
|
/* background: linear-gradient(to bottom, rgba(255,255,255,0), white 2.5em); */
|
|
|
/* background: white; */
|
|
|
}
|
|
|
|
|
|
|
|
|
[data-picked] {
|
|
|
text-decoration: underline;
|
|
|
position: relative;
|
|
|
/* font-style: italic; */
|
|
|
}
|
|
|
|
|
|
.last-word-previous-step [data-picked] {
|
|
|
text-decoration: none;
|
|
|
}
|
|
|
|
|
|
[data-picked]::after {
|
|
|
content: ' → ';
|
|
|
text-decoration: none;
|
|
|
position: absolute;
|
|
|
left: calc(100% + 1.5em);
|
|
|
top: 0;
|
|
|
height: 1.2em;
|
|
|
display: block;
|
|
|
}
|
|
|
|
|
|
.traces> :last-child [data-picked]::after {
|
|
|
display: none;
|
|
|
}
|
|
|
|
|
|
.chapter {
|
|
|
page-break-before: right;
|
|
|
}
|
|
|
|
|
|
.chapter#chapter-introduction,
|
|
|
.chapter#chapter-description,
|
|
|
.chapter#chapter-technical-description,
|
|
|
.chapter#chapter-credits {
|
|
|
width: 65%;
|
|
|
}
|
|
|
|
|
|
.pagedjs_right_page .chapter#chapter-introduction,
|
|
|
.pagedjs_right_page .chapter#chapter-description,
|
|
|
.pagedjs_right_page .chapter#chapter-technical-description,
|
|
|
.pagedjs_right_page .chapter#chapter-credits {
|
|
|
margin-left: 30%;
|
|
|
}
|
|
|
|
|
|
.pagedjs_right_page .sources {
|
|
|
float: left;
|
|
|
margin-left: calc(-1 * (40% + 3em));
|
|
|
width: 40%;
|
|
|
}
|
|
|
|
|
|
|
|
|
.pagedjs_left_page .sources {
|
|
|
float: left;
|
|
|
margin-right: calc(-1 * (100% + 3em));
|
|
|
width: 40%;
|
|
|
}
|
|
|
|
|
|
.sources a {
|
|
|
word-break: break-all;
|
|
|
}
|
|
|
|
|
|
a {
|
|
|
text-decoration: underline dotted;
|
|
|
color: currentColor;
|
|
|
}
|
|
|
|
|
|
pre, code, table {
|
|
|
font-family: "Fira mono";
|
|
|
font-size: .8rem;
|
|
|
}
|
|
|
|
|
|
th {
|
|
|
font-weight: normal;
|
|
|
}
|
|
|
|
|
|
h4 {
|
|
|
font-size: 1rem;
|
|
|
}
|
|
|
|
|
|
h4:first-child {
|
|
|
margin-top: 0;
|
|
|
}
|
|
|
|
|
|
|
|
|
.pagedjs_right_page p.images {
|
|
|
margin-left: -25%;
|
|
|
}
|
|
|
|
|
|
.pagedjs_left_page p.images {
|
|
|
margin-right: -25%;
|
|
|
}
|
|
|
|
|
|
img {
|
|
|
display: inline-block;
|
|
|
max-width: 45%;
|
|
|
vertical-align: top;
|
|
|
}
|
|
|
|
|
|
.complete_sentence {
|
|
|
height: 282mm;
|
|
|
text-align: center;
|
|
|
display: flex;
|
|
|
flex-direction: column;
|
|
|
justify-content: center;
|
|
|
page-break-before: right;
|
|
|
page-break-after: left;
|
|
|
padding-left: 15mm;
|
|
|
padding-right: 15mm;
|
|
|
}
|
|
|
</style>
|
|
|
{% if DEBUG %}
|
|
|
<link href="{{ STATIC_DIR }}/pagedjs-interface.css" rel="stylesheet" type="text/css">
|
|
|
<script src="https://unpkg.com/pagedjs/dist/paged.polyfill.js"></script>
|
|
|
{% endif %}
|
|
|
</head>
|
|
|
|
|
|
<body>
|
|
|
<h1>Walk along the trees of Madrid with <em>{{ author }}</em> and {{ title }}</h1>
|
|
|
|
|
|
|
|
|
<section class="chapter" id="chapter-chain">
|
|
|
<h2 id="poema-paseo">Walk along the trees of Madrid with <em>{{ author }}</em> and {{ title }}</h2>
|
|
|
{% for sentence, previous_steps, tree, traces, last_word_previous_step in path %}
|
|
|
<section class="step{% if loop.first %} first{% endif %}">
|
|
|
<section class="sentence">
|
|
|
{{ previous_steps }}
|
|
|
</section>
|
|
|
<section class="step_content">
|
|
|
<ul class="traces">
|
|
|
{% if last_word_previous_step %}
|
|
|
<li class="last-word-previous-step">
|
|
|
<ul class="options">
|
|
|
<li data-picked>{{ last_word_previous_step }}</li>
|
|
|
</ul>
|
|
|
</li>
|
|
|
{% endif %}
|
|
|
{% for word, dice, options in traces %}
|
|
|
<li style="margin-top: calc(-{{ dice }} * var(--line-height))">
|
|
|
<ul class="options">
|
|
|
{% for option in options %}
|
|
|
<li {% if loop.index0==dice %}data-picked{% endif %}>
|
|
|
{{ option }}
|
|
|
</li>
|
|
|
{% endfor %}
|
|
|
</ul>
|
|
|
<!-- Rolled: {{ dice }} -->
|
|
|
</li>
|
|
|
{% endfor %}
|
|
|
</ul>
|
|
|
</section>
|
|
|
<section class="tree">
|
|
|
{{ tree.properties.NOMBRE_COMUN }} on {{ tree.properties.MINTDIRECCIONAUX }}
|
|
|
</section>
|
|
|
</section>
|
|
|
{% endfor %}
|
|
|
<section class="complete_sentence">
|
|
|
{{ complete_sentence }}
|
|
|
</section>
|
|
|
</section>
|
|
|
|
|
|
|
|
|
<section class="chapter" id="chapter-introduction">
|
|
|
<h2 id="introducci-n">Introduction</h2>
|
|
|
<p>Walk along the trees of Madrid is a book in the collection <em>An Algoliterary Publishing House: making kin
with trees</em>.
<br>The author of this book is the Markov chains algorithm. It simultaneously generates a poem and a
walk along the trees of the neighbourhood Las Letras in the centre of Madrid.<br>The poem is created from a novel
chosen by the reader. The reader can choose between two novels by great Spanish writers of the 19th century:
</p>
|
|
|
<ul>
|
|
|
<li><em>The Swan of Vilamorta</em> by the feminist writer Emilia Pardo Bazán, published in 1885. </li>
|
|
|
<li><em>Marianela</em> by the writer Benito Pérez Galdós, published in 1878. </li>
|
|
|
</ul>
|
|
|
<p>The walk is generated from the database of trees in Madrid, <a
href="http://www-2.munimadrid.es/DGPVE_WUAUA/welcome.do">Un Alcorque, un Árbol</a>. Each significant word -
noun, adjective, verb or adverb - is related to a tree in Madrid's neighbourhood Las Letras. The other words
create the path between the different trees. Thus one can walk through the neighbourhood reciting parts of the
poem to each tree along the promenade.<br>This book is by definition infinite and unique.<br>It is created by
Anaïs Berck, a pseudonym that represents a collaboration between humans, algorithms and trees. Anaïs Berck
explores the specificities of human intelligence in the company of artificial and plant intelligences.<br>An
Algoliterary Publishing House is a collection of publications in which algorithms are the authors of unusual
books. This book was created as part of a residency at the centre for contemporary arts Medialab Prado in Madrid.
The residency was granted by the programme "Residency Digital Culture" initiated by the Flemish Government.
</p>
|
|
|
<p>In this work Anaïs Berck is represented by:</p>
|
|
|
<ul>
|
|
|
<li>the Markov chains algorithm, of which a description is given in this book,</li>
|
|
|
<li>the trees of Madrid, which are geolocated between Medialab Prado, Plaza del Sol and Atocha Renfe, and present
|
|
|
in the database <a href="http://www-2.munimadrid.es/DGPVE_WUAUA/welcome.do">Un Alcorque, un Árbol</a>,</li>
|
|
|
<li>the human beings Emilia Pardo Bazán, Benito Pérez Galdós, Jaime Munárriz, Luis Morell, An Mertens, Eva Marina
|
|
|
Gracia, Gijs de Heij, Ana Isabel Garrido Mártinez, Alfredo Calosci, Daniel Arribas Hedo.</li>
|
|
|
</ul>
|
|
|
</section>
|
|
|
|
|
|
<section class="chapter" id="chapter-description">
|
|
|
<h2 id="general-description-of-the-markov-chains">General description of the Markov Chains</h2>
|
|
|
<section class="sources">
|
|
|
<h4 id="sources">Sources</h4>
|
|
|
<ul>
|
|
|
<li>
|
|
|
<a
|
|
|
href="https://spectrum.ieee.org/andrey-markov-and-claude-shannon-built-the-first-language-generation-models">https://spectrum.ieee.org/andrey-markov-and-claude-shannon-built-the-first-language-generation-models</a>
|
|
|
</li>
|
|
|
<li><a href="http://langvillea.people.cofc.edu/MCapps7.pdf">http://langvillea.people.cofc.edu/MCapps7.pdf</a></li>
|
|
|
<li>
|
|
|
<a
|
|
|
href="https://www.irishtimes.com/news/science/that-s-maths-andrey-markov-s-brilliant-ideas-are-still-bearing-fruit-1.3220929">https://www.irishtimes.com/news/science/that-s-maths-andrey-markov-s-brilliant-ideas-are-still-bearing-fruit-1.3220929</a>
|
|
|
</li>
|
|
|
<li>
|
|
|
<a
|
|
|
href="http://www.alpha60.de/research/markov/DavidLink_TracesOfTheMouth_2006.pdf">http://www.alpha60.de/research/markov/DavidLink_TracesOfTheMouth_2006.pdf</a>
|
|
|
</li>
|
|
|
</ul>
|
|
|
</section>
|
|
|
<h3 id="historias">Histories</h3>
|
|
|
<p>Andrey Andreyevich Markov was a Russian mathematician who lived between 1856 and 1922. His most famous work is
on Markov chains, an algorithm that makes it possible to predict future states once the current state is known.
The first paper on the subject was published in 1906. He was also interested in literature. He tried to establish
a mathematical model of language using Markov chains by manually counting letters of Alexander Pushkin's verse
novel Eugene Onegin. Next, he applied the method to the novel Childhood Years of Bagrov's Grandson by S.T.
Aksakov. This links Markov chains directly to the field of literature, text and language, a link that has
persisted throughout the history of the algorithm.<br>The following text is based on Oscar Schwartz'
|
|
|
article for IEEE Spectrum, <a
|
|
|
href="https://spectrum.ieee.org/andrey-markov-and-claude-shannon-built-the-first-language-generation-models">Andrey
|
|
|
Markov & Claude Shannon Counted Letters to Build the First Language-Generation Models</a>.<br>In 1913,
|
|
|
Andrey Markov sat down in his study in St. Petersburg with a copy of Alexander Pushkin’s 19th century verse novel,
|
|
|
<a href="https://en.wikipedia.org/wiki/Eugene_Onegin">Eugene Onegin</a>, a literary classic at the time. This work
|
|
|
comprises almost 400 stanzas of iambic tetrameter. Markov, however, did not start reading Pushkin’s famous text.
|
|
|
Rather, he took a pen and piece of drafting paper, and wrote out the first 20,000 letters of the book in one long
|
|
|
string of letters, eliminating all punctuation and spaces. Then he arranged these letters in 200 grids (10-by-10
|
|
|
characters each) and began counting the vowels in every row and column, tallying the results.<br>In separating the
|
|
|
vowels from the consonants, Markov was testing a theory of probability that he had developed in 1906 and that we
|
|
|
now call a Markov Process or Markov Chain. Up until that point, the field of probability had been mostly limited
|
|
|
to analyzing phenomena like roulette or coin flipping, where the outcome of previous events does not change the
|
|
|
probability of current events. But Markov felt that most things happen in chains of causality and are dependent on
|
|
|
prior outcomes. He wanted a way of modeling these occurrences through probabilistic analysis.<br>Language, Markov
|
|
|
believed, was an example of a system where past occurrences partly determine present outcomes. To demonstrate
|
|
|
this, he wanted to show that in a text like Pushkin’s novel, the chance of a certain letter appearing at some
|
|
|
point in the text is dependent, to some extent, on the letter that came before it. </p>
|
|
|
<p>To do so, Markov began counting vowels in Eugene Onegin, and found that 43 percent of letters were vowels and 57
|
|
|
percent were consonants. Then Markov separated the 20,000 letters into pairs of vowels and consonant combinations.
|
|
|
He found that there were 1,104 vowel-vowel pairs, 3,827 consonant-consonant pairs, and 15,069 vowel-consonant and
|
|
|
consonant-vowel pairs. What this demonstrated, statistically speaking, was that for any given letter in Pushkin’s
|
|
|
text, if it was a vowel, odds were that the next letter would be a consonant, and vice versa. </p>
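<p>As an illustration only, Markov's hand counting can be sketched in a few lines of Python. The sample string
below stands in for Pushkin's text; the procedure is the same: classify every letter as vowel or consonant and
tally how often each kind follows the other.</p>
<pre>
# Minimal sketch of Markov's counting procedure on an arbitrary sample string.
# The sample text below is a placeholder, not Pushkin's novel.

def count_pairs(text):
    letters = [c for c in text.lower() if c.isalpha()]
    vowels = set("aeiou")
    kinds = ["V" if c in vowels else "C" for c in letters]
    totals = {"V": kinds.count("V"), "C": kinds.count("C")}
    pairs = {"VV": 0, "CC": 0, "VC": 0, "CV": 0}
    for first, second in zip(kinds, kinds[1:]):
        pairs[first + second] += 1
    return totals, pairs

sample = "the quick brown fox jumped over the lazy dog"
totals, pairs = count_pairs(sample)
print(totals)   # how many vowels and consonants there are
print(pairs)    # how often each kind follows the other
</pre>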
|
|
|
<p>Markov used this analysis to demonstrate that Pushkin’s Eugene Onegin wasn’t just a random distribution of
|
|
|
letters but had some underlying statistical qualities that could be modeled. The enigmatic research paper that
|
|
|
came out of this study, entitled <a href="http://cs.petrsu.ru/~olbgvl/greatapp/my_project/example_eng.html">An
|
|
|
Example of Statistical Investigation of the Text Eugene Onegin Concerning the Connection of Samples in
|
|
|
Chains</a> was not widely cited in Markov’s lifetime, and not translated to English until 2006. Markov was
|
|
|
forced to stop his letter-counting experiments when he had nearly completely lost his sight due to glaucoma. Even
|
|
|
if Markov had had more time and better eyesight to carry his experiments further, extensions would have been very
|
|
|
difficult to complete, given the precomputer era he lived in, when computational efforts had to be paid in
|
|
|
man-years. </p>
|
|
|
<p class="images"><img src="{{ STATIC_DIR }}/images/markov_1.jpeg" alt="">
|
|
|
<img src="{{ STATIC_DIR }}/images/markov_2.jpeg" alt=""><br>These images show Markov’s original notes for computing the probabilities
|
|
|
needed for his Pushkin chain.
|
|
|
</p>
|
|
|
<h3 id="influence">Influence</h3>
|
|
|
<p>Some of Markov's central concepts around probability and language spread across the globe, eventually finding
|
|
|
re-articulation in Claude Shannon’s hugely influential paper, <a
|
|
|
href="https://people.math.harvard.edu/~ctm/home/text/others/shannon/entropy/entropy.pdf">A Mathematical Theory
|
|
|
of Communication</a> which came out in 1948.<br>Shannon’s paper outlined a way to precisely measure the quantity
|
|
|
of information in a message, and in doing so, set the foundations for a theory of information that would come to
|
|
|
define the digital age. Shannon was fascinated by Markov’s idea that in a given text, the likelihood of some
|
|
|
letter or word appearing could be approximated. Like Markov, Shannon demonstrated this by performing some textual
|
|
|
experiments that involved making a statistical model of language, then took a step further by trying to use the
|
|
|
model to generate text according to those statistical rules.<br>In an initial control experiment, he started by
|
|
|
generating a sentence by picking letters randomly from a 27-symbol alphabet (26 letters, plus a space), and got
|
|
|
the following output: </p>
|
|
|
<p>XFOML RXKHRJFFJUJ ZLPWCFWKCYJ FFJEYVKCQSGHYD QPAAMKBZAACIBZLHJQD </p>
|
|
|
<p>The sentence was meaningless noise, Shannon said, because when we communicate we don’t choose letters with equal
|
|
|
probability. As Markov had shown, consonants are more likely than vowels. But at a greater level of granularity,
|
|
|
E’s are more common than S’s, which are more common than Q’s. To account for this, Shannon amended his original
|
|
|
alphabet so that it modeled the probability of English more closely—he was 11 percent more likely to draw an E
|
|
|
from the alphabet than a Q. When he again drew letters at random from this recalibrated corpus he got a sentence
|
|
|
that came a bit closer to English. </p>
|
|
|
<p>OCRO HLI RGWR NMIELWIS EU LL NBNESEBYA TH EEI ALHENHTTPA OOBTTVA NAH BRL. </p>
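<p>A minimal sketch of that recalibrated drawing, assuming Python's random module and a rough, illustrative table
of English letter frequencies rather than Shannon's exact figures:</p>
<pre>
import random

# Sketch of Shannon's first-order approximation: draw symbols with unequal
# probabilities. The weights below are rough illustrative English letter
# frequencies plus the space, not Shannon's exact table.
alphabet = list("abcdefghijklmnopqrstuvwxyz ")
weights  = [8.2, 1.5, 2.8, 4.3, 12.7, 2.2, 2.0, 6.1, 7.0, 0.2, 0.8, 4.0, 2.4,
            6.7, 7.5, 1.9, 0.1, 6.0, 6.3, 9.1, 2.8, 1.0, 2.4, 0.2, 2.0, 0.1, 18.0]
print("".join(random.choices(alphabet, weights=weights, k=60)))
</pre>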
|
|
|
<p>In a series of subsequent experiments, Shannon demonstrated that as you make the statistical model even more
|
|
|
complex, you get increasingly more comprehensible results. Shannon, via Markov, revealed a statistical framework
|
|
|
for the English language, and showed that by modeling this framework—by analyzing the dependent probabilities of
|
|
|
letters and words appearing in combination with each other—he could actually generate language. </p>
|
|
|
<p>The more complex the statistical model of a given text, the more accurate the language generation becomes—or as
|
|
|
Shannon put it, the greater “resemblance to ordinary English text.” In the final experiment, Shannon drew from a
|
|
|
corpus of words instead of letters and achieved the following: </p>
|
|
|
<p>THE HEAD AND IN FRONTAL ATTACK ON AN ENGLISH WRITER THAT THE CHARACTER OF THIS POINT IS THEREFORE ANOTHER METHOD
|
|
|
FOR THE LETTERS THAT THE TIME OF WHO EVER TOLD THE PROBLEM FOR AN UNEXPECTED. </p>
|
|
|
<p>For both Shannon and Markov, the insight that language’s statistical properties could be modeled offered a way to
|
|
|
re-think broader problems that they were working on. For Markov, it extended the study of stochasticity beyond
|
|
|
mutually independent events, paving the way for a new era in probability theory. For Shannon, it helped him
|
|
|
formulate a precise way of measuring and encoding units of information in a message, which revolutionized
|
|
|
telecommunications and, eventually, digital communication. But their statistical approach to language modeling and
|
|
|
generation also ushered in a new era for natural language processing, which has ramified through the digital age
|
|
|
to this day. As David Link notes in his article, Traces of the Mouth, Markov's efforts in retrospect
|
|
|
“represent an early and momentous attempt to understand the phenomenon of language in mathematical terms.”
|
|
|
It's not an exaggeration to say that Markov's analysis of text is in principle similar to what Google and
|
|
|
other firms now routinely carry out on a massive scale: analyzing words in books and internet documents, the order
|
|
|
in which the words occur, analyzing search phrases, detecting spam and so on. </p>
|
|
|
<h3 id="applications">Applications</h3>
|
|
|
<p>Since Markov chains can be designed to model many real-world processes, they are used in a wide variety of
|
|
|
situations. They appear in physics and chemistry when probabilities are used for unknown quantities. In
|
|
|
information processing, they have a role in pattern recognition, automatic speech analysis and synthesis and data
|
|
|
compression. They are used by meteorologists, ecologists and biologists. Other applications include the control of
|
|
|
driverless cars, machine translation, queuing patterns, and prediction of population growth, asset prices,
|
|
|
currency exchange rates and market upheavals. Artists have also used Markov chains, such as the musician Iannis
Xenakis, who developed “Free Stochastic Music” based on Markov chains. </p>
|
|
|
<p>In 2006 – the 100th anniversary of Markov's paper – Philipp von Hilgers and Amy Langville summarized the <a
|
|
|
href="http://langvillea.people.cofc.edu/MCapps7.pdf">five greatest applications of Markov chains</a>. This
|
|
|
includes the one that is used by most of us on a daily basis, Google's PageRank. Every time we search on the
internet, the ranking of webpages is based on the solution to a massive Markov chain. You can say that all the web
|
|
|
pages are states, and the links between them are transitions possessing specific probabilities. In other words, we
|
|
|
can say that no matter what you’re searching on Google, there’s a finite probability of you ending up on a
|
|
|
particular web page. If you use Gmail, you must have noticed its auto-fill feature. This feature automatically
|
|
|
predicts your sentences to help you write emails quickly.<br>And last but not least, have you ever wondered why
|
|
|
spam has all those hilarious nonsensical strings of words in it? They’re pretty odd constructions, not as random
|
|
|
as if you picked words randomly out of a hat, almost grammatical much of the time, but still clearly gibberish.
|
|
|
Also here the Markov chains have taken on a lot of the work. </p>
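<p>As a rough illustration of the PageRank idea, the sketch below uses a made-up three-page link graph (not
Google's actual algorithm or data) and repeatedly applies the chain's transition probabilities until the ranking
settles:</p>
<pre>
# Illustrative sketch of PageRank as a Markov chain: pages are states, links are
# transitions, and the ranking is the chain's stationary distribution.
# The three-page link graph below is invented for the example.
links = {"A": ["B", "C"], "B": ["C"], "C": ["A"]}
pages = list(links)
damping = 0.85
rank = {p: 1.0 / len(pages) for p in pages}

for _ in range(50):                      # power iteration
    new_rank = {p: (1.0 - damping) / len(pages) for p in pages}
    for page, outgoing in links.items():
        share = damping * rank[page] / len(outgoing)
        for target in outgoing:
            new_rank[target] += share
    rank = new_rank

print(rank)  # pages with more incoming weight end up with a higher rank
</pre>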
|
|
|
</section>
|
|
|
|
|
|
<section class="chapter" id="chapter-technical-description">
|
|
|
<h2 id="technical-description-of-the-markov-chains">Technical description of the Markov Chain</h2>
|
|
|
<aside class="sources">
|
|
|
<h4>Sources</h4>
|
|
|
<ul>
|
|
|
<li>
|
|
|
<a
|
|
|
href="https://en.wikipedia.org/wiki/Examples_of_Markov_chains">https://en.wikipedia.org/wiki/Examples_of_Markov_chains</a>
|
|
|
</li>
|
|
|
<li>
|
|
|
<a
|
|
|
href="https://higherkindedtripe.wordpress.com/2012/02/26/markov-chains-or-daddy-where-does-spam-come-from/">https://higherkindedtripe.wordpress.com/2012/02/26/markov-chains-or-daddy-where-does-spam-come-from/</a>
|
|
|
</li>
|
|
|
<li>
|
|
|
<a
|
|
|
href="https://towardsdatascience.com/predicting-the-weather-with-markov-chains-a34735f0c4df">https://towardsdatascience.com/predicting-the-weather-with-markov-chains-a34735f0c4df</a>
|
|
|
</li>
|
|
|
</ul>
|
|
|
|
|
|
</aside>
|
|
|
<p>In a Markov process we can predict future changes once we know the current state. Wikipedia gives a very good
|
|
|
description of the difference between Markov chains and other systems: 'A game of snakes and ladders or any
|
|
|
other game whose moves are determined entirely by dice is a Markov chain, indeed, an absorbing Markov chain. This
|
|
|
is in contrast to card games such as blackjack, where the cards represent a 'memory' of the past moves. To
|
|
|
see the difference, consider the probability for a certain event in the game. In the above-mentioned dice games,
|
|
|
the only thing that matters is the current state of the board. The next state of the board depends on the current
|
|
|
state, and the next roll of the dice. It doesn't depend on how things got to their current state. In a game
|
|
|
such as blackjack, a player can gain an advantage by remembering which cards have already been shown (and hence
|
|
|
which cards are no longer in the deck), so the next state (or hand) of the game is not independent of the past
|
|
|
states.'<br>So, for a Markov process, only the current state determines the next state; the history of the
|
|
|
system has no impact. For that reason we describe a Markov process as memoryless. What happens next is determined
|
|
|
completely by the current state and the transition probabilities. </p>
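<p>A minimal sketch of such a memoryless process, assuming Python's random module and two invented weather states
with made-up transition probabilities:</p>
<pre>
import random

# Minimal sketch of a memoryless (Markov) process: the next state is drawn
# from probabilities that depend only on the current state. The two states
# and their transition probabilities are invented for the example.
transitions = {
    "sunny": [("sunny", 0.8), ("rainy", 0.2)],
    "rainy": [("sunny", 0.4), ("rainy", 0.6)],
}

state = "sunny"
history = [state]
for _ in range(10):
    options = transitions[state]
    state = random.choices([s for s, p in options], [p for s, p in options])[0]
    history.append(state)

print(" -> ".join(history))
</pre>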
|
|
|
<p>In what follows, we describe the classic working of Markov chains, alongside the simplified version we used to
develop a Markov game and the code for this book. </p>
|
|
|
<h3 id="classic-version">Classic version</h3>
|
|
|
<p>This example is taken from the following source: <a
|
|
|
href="https://higherkindedtripe.wordpress.com/2012/02/26/markov-chains-or-daddy-where-does-spam-come-from/">https://higherkindedtripe.wordpress.com/2012/02/26/markov-chains-or-daddy-where-does-spam-come-from/</a>
|
|
|
</p>
|
|
|
<p>You take a piece of “training” text.<br>You make a list of all the words in it.<br>For each word, make a list of
|
|
|
all the other words that come after it, with the number of times each word appears. So with the sentence: “the
|
|
|
quick brown fox jumped over the lazy dog”, you would end up with the list: </p>
|
|
|
<ol>
|
|
|
<li>the → (1, quick), (1, lazy) </li>
|
|
|
<li>quick → (1, brown)</li>
|
|
|
<li>brown → (1, fox)</li>
|
|
|
<li>fox → (1, jumped)</li>
|
|
|
<li>jumped → (1, over)</li>
|
|
|
<li>over → (1, the)</li>
|
|
|
<li>lazy → (1, dog)</li>
|
|
|
<li>dog →</li>
|
|
|
</ol>
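<p>A minimal Python sketch of this counting step (the function name and structure are ours, not taken from the
book's code):</p>
<pre>
from collections import defaultdict

# Build, for each word, a count of the words that follow it in the training text.
def follower_counts(text):
    words = text.split()
    counts = defaultdict(lambda: defaultdict(int))
    for current, following in zip(words, words[1:]):
        counts[current][following] += 1
    return counts

sentence = "the quick brown fox jumped over the lazy dog"
for word, followers in follower_counts(sentence).items():
    print(word, dict(followers))   # e.g. the -> quick: 1, lazy: 1
</pre>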
|
|
|
<p>Turn the list into a matrix, where the rows represent the “leading” words and the columns represent “following”
|
|
|
words, and each number in the matrix says how many times the following word appeared after the leading word. You
|
|
|
will get: </p>
|
|
|
<table>
|
|
|
<thead>
|
|
|
<tr>
|
|
|
<th style="text-align:left"></th>
|
|
|
<th style="text-align:center">the</th>
|
|
|
<th style="text-align:center">quick</th>
|
|
|
<th style="text-align:center">brown</th>
|
|
|
<th style="text-align:center">fox</th>
|
|
|
<th style="text-align:center">jumped</th>
|
|
|
<th style="text-align:center">over</th>
|
|
|
<th style="text-align:center">lazy</th>
|
|
|
<th style="text-align:right">dog</th>
|
|
|
</tr>
|
|
|
</thead>
|
|
|
<tbody>
|
|
|
<tr>
|
|
|
<td style="text-align:left">the</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">quick</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">brown</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">fox</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">jumped</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">over</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">lazy</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">1</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">dog</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
</tbody>
|
|
|
</table>
|
|
|
<p>Divide every number in the matrix by the total of its row, and you’ll notice that each row becomes a sort of
|
|
|
probability distribution. </p>
|
|
|
<table>
|
|
|
<thead>
|
|
|
<tr>
|
|
|
<th style="text-align:left"></th>
|
|
|
<th style="text-align:center">the</th>
|
|
|
<th style="text-align:center">quick</th>
|
|
|
<th style="text-align:center">brown</th>
|
|
|
<th style="text-align:center">fox</th>
|
|
|
<th style="text-align:center">jumped</th>
|
|
|
<th style="text-align:center">over</th>
|
|
|
<th style="text-align:center">lazy</th>
|
|
|
<th style="text-align:right">dog</th>
|
|
|
</tr>
|
|
|
</thead>
|
|
|
<tbody>
|
|
|
<tr>
|
|
|
<td style="text-align:left">the</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0.5</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0.5</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">quick</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">brown</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">fox</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">jumped</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">over</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">1</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">lazy</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">1</td>
|
|
|
</tr>
|
|
|
<tr>
|
|
|
<td style="text-align:left">dog</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:center">0</td>
|
|
|
<td style="text-align:right">0</td>
|
|
|
</tr>
|
|
|
</tbody>
|
|
|
</table>
|
|
|
<p>You can interpret this as saying “if the first word is a ‘the’ there’s a 50% chance the next word is ‘quick’, and
|
|
|
a 50% chance the next word is ‘lazy’. For all the other words, there is only one possible word following
|
|
|
it.”<br>Almost every word has only one possible following word because the text is so short. But, if you train it
|
|
|
with a larger text, and interpret the rows as a probability distribution, you can start to see for every word what
|
|
|
sort of word tends to follow it. This gives a very interesting insight into the nature of written text.<br>If you
|
|
|
take that big “transition matrix” you’ve trained from a large text, you can use it to actually generate new text
|
|
|
in the following way: </p>
|
|
|
<ol>
|
|
|
<li>
|
|
|
<p>Pick a “seed” word from the text at random. For best results use one with many possible following words.</p>
|
|
|
</li>
|
|
|
<li>
|
|
|
<p>Find the row in the matrix corresponding to that word. Choose the next word at random, weighted according to
|
|
|
the probabilities in the row. That is, if the column corresponding to the word “blue” has the number .05 in
|
|
|
it, you have a 5% chance of picking “blue” as the next word, and so on (when we divided each number by the
|
|
|
total of its row we made sure that these probabilities would add up to 1).</p>
|
|
|
</li>
|
|
|
<li>
|
|
|
<p>Go back to step 2 using this second word as the new “seed” word. Continue this process to generate as long a
|
|
|
string of words as you want. If you end up with a word for which no other words follow it (uncommon when you
|
|
|
train on a large text, but possible – imagine if the last word of a novel was the only occurrence of the word
|
|
|
“xylophone”, or whatever), just pick a random word.</p>
|
|
|
</li>
|
|
|
</ol>
|
|
|
<p>You can see how strings of words generated with this method will follow the “trends” of the training data,
|
|
|
meaning that if you were to generate a new transition matrix from the generated words it would, on average, look
|
|
|
the same as the original transition matrix since you picked the words according to those weights. This completely
|
|
|
mechanical process can generate data which looks, statistically, like meaningful English. Of course, it is not
|
|
|
necessarily grammatical, and is certainly devoid of higher meaning since it was generated through this simplistic
|
|
|
process. </p>
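<p>Put together, a minimal sketch of this training and generation loop might look as follows; it is a hypothetical
illustration, not the code of this book, which is reproduced in the chapter Code of the book:</p>
<pre>
import random
from collections import defaultdict

# Train: record which words follow which, then generate by weighted random choice.
def train(text):
    words = text.split()
    followers = defaultdict(list)
    for current, following in zip(words, words[1:]):
        followers[current].append(following)
    return followers

def generate(followers, seed, length=20):
    word, output = seed, [seed]
    for _ in range(length - 1):
        options = followers.get(word)
        if not options:                       # dead end: pick a random word
            word = random.choice(list(followers))
        else:                                 # weighted by how often each follower occurred
            word = random.choice(options)
        output.append(word)
    return " ".join(output)

model = train("the quick brown fox jumped over the lazy dog")
print(generate(model, "the"))
</pre>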
|
|
|
<p>Those “chains” of words constructed by the above process are an example of Markov chains. And they are also the
|
|
|
answer to the question “where does spam come from?”. Those uncannily-almost-grammatical ramblings below the
|
|
|
“Viagra” ads, generated through the above process, are the spam creators' way of fooling your spam filter. They
|
|
|
include these chains to give their advertisements statistical similarity to meaningful human correspondence. This
|
|
|
works because the spam filters are (at least in part) using probabilistic models that depend on word-transitions
|
|
|
and word frequencies to classify incoming email as spam. The spammers and the filter-writers are engaged in an
|
|
|
eternal game of randomly-generated cat-and-mouse. </p>
|
|
|
<h3 id="simplified-version">Simplified version</h3>
|
|
|
<p>With <a href="https://algolit.net">Algolit</a>, an artistic research group on libre code and text based in
|
|
|
Brussels, we developed a Markov chain game with sentences and cards. This happened as part of the festival Désert
Numérique in La Drôme, France, in 2014. The game was developed by Brendan Howell, Catherine Lenoble and An
Mertens. You can listen back to the radio show: <a
href="http://desert.numerique.free.fr//archives/?id=1011&ln=fr">http://desert.numerique.free.fr//archives/?id=1011&ln=fr</a>.<br>Next,
the game was presented at Transmediale in Berlin in 2015, following the rules below. </p>
|
|
|
<ol>
|
|
|
<li>
|
|
|
<p>We take a text, for example:</p>
|
|
|
<blockquote>
|
|
|
<p>Cqrrelations read as poetry to statisticians. Can statisticians read poetry with machines? Cqrrelations is a
|
|
|
practise for artists, for datatravellers, statisticians and other lovers of machines to explore a world of
|
|
|
blurry categorisations and crummylations. Machines correlate to dissidents, dissidents correlate to
|
|
|
statisticians.</p>
|
|
|
</blockquote>
|
|
|
</li>
|
|
|
<li>
|
|
|
<p>We create a database for this text; each word is an entry and takes the following word as a possible value.
|
|
|
The entry for ‘Cqrrelations’ will have two values:</p>
|
|
|
<ol>
|
|
|
<li>read</li>
|
|
|
<li>is </li>
|
|
|
</ol>
|
|
|
</li>
|
|
|
<li>
|
|
|
<p>Once the database is created, we choose a starting word for a new text, for ex. Cqrrelations.</p>
|
|
|
</li>
|
|
|
<li>We roll the dice: odd numbers give ‘read’ as the 2nd word of our text; even numbers give ‘is’ as the
2nd word.</li>
|
|
|
<li>We roll the dice again, and choose a word amongst the values of the chosen word. This gives the next word of
|
|
|
our sentence.</li>
|
|
|
<li>We continue step 5 till we arrive at a word that ends with a period (.)</li>
|
|
|
<li>We can repeat steps 3 till 6 until we are satisfied with the number of generated sentences.</li>
|
|
|
</ol>
|
|
|
<p>Based on the input text the output at Transmediale was: </p>
|
|
|
<blockquote>
|
|
|
<p>A world of blurry categorisations and other lovers of blurry categorisations and other lovers of blurry
|
|
|
categorisations and other lovers of machines. Cqrrelations read poetry to dissidents correlate to machines.
|
|
|
Lovers of machines to statisticians.</p>
|
|
|
</blockquote>
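<p>The same rules can be sketched in a few lines of Python, with the random module standing in for the dice; this
is an illustration of the game, not the code that was used at the festival:</p>
<pre>
import random

# Dice-game version: each word maps to the words that may follow it; a roll of
# the dice (here random.choice) picks the next word until a word ends in a period.
def play(text, start):
    words = text.split()
    followers = {}
    for current, following in zip(words, words[1:]):
        followers.setdefault(current, []).append(following)
    sentence = [start]
    word = start
    while not word.endswith(".") and word in followers:
        word = random.choice(followers[word])   # the dice roll
        sentence.append(word)
    return " ".join(sentence)

corpus = ("Cqrrelations read as poetry to statisticians. Can statisticians read "
          "poetry with machines? Cqrrelations is a practise for artists, for "
          "datatravellers, statisticians and other lovers of machines.")
print(play(corpus, "Cqrrelations"))
</pre>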
|
|
|
</section>
|
|
|
|
|
|
<section class="chapter" id="chapter-code">
|
|
|
<h2 id="code">Code of the book</h2>
|
|
|
{% for path, source in sources %}
|
|
|
<h3>{{ path }}</h3>
|
|
|
<pre>{{ source }}</pre>
|
|
|
{% endfor %}
|
|
|
</section>
|
|
|
|
|
|
<section class="chapter" id="chapter-credits">
|
|
|
<h2 id="credits">Credits</h2>
|
|
|
<p>This book is a creation of Anaïs Berck for Medialab Prado as part of the programme "Residency Digital Culture"
|
|
|
initiated by the Flemish Government.
|
|
|
In this work Anaïs Berck is represented by:</p>
|
|
|
<ul>
|
|
|
<li>the Markov chains algorithm, of which a description is given in this book,</li>
|
|
|
<li>the trees of Madrid, which are geolocated between Medialab Prado, Plaza del Sol and Atocha Renfe, and present
|
|
|
in the database <a href="http://www-2.munimadrid.es/DGPVE_WUAUA/welcome.do">Un Alcorque, un Árbol</a>,</li>
|
|
|
<li>the human beings Emilia Pardo Bazán, Benito Pérez Galdós, Jaime Munárriz, Luis Morell, An Mertens, Eva Marina
|
|
|
Gracia, Gijs de Heij, Ana Isabel Garrido Mártinez, Alfredo Calosci, Daniel Arribas Hedo.</li>
|
|
|
</ul>
|
|
|
<p>This copy of the book is unique and the print run is by definition infinite.<br>
This copy is number {{ edition_count }} of the copies downloaded. </p>
|
|
|
<p>Collective terms of (re)use (CC4r), 2021<br>Copyleft with a difference: you are invited to copy, distribute, and
modify this work under the terms of the <a
href="https://gitlab.constantvzw.org/unbound/cc4r">CC4r</a>.</p>
|
|
|
</section>
|
|
|
|
|
|
</body>
|
|
|
|
|
|
</html> |