diff --git a/exp.opacity/make.py b/exp.opacity/make.py index 359f65c..4d9626f 100644 --- a/exp.opacity/make.py +++ b/exp.opacity/make.py @@ -11,7 +11,7 @@ sys.path.append('../') # importing required module import summa.summarizer -from summa.summarizer import summarize +from summa.summarizer import scored_sentences # TODO: @@ -167,7 +167,7 @@ def wiki_parse(sentences): def txt2rankedsentences(txt): # from txt to ranked sentences - return summarize(txt, split=True) + return scored_sentences(txt, split=True) # main diff --git a/make.py b/make.py deleted file mode 100644 index c1b8436..0000000 --- a/make.py +++ /dev/null @@ -1,233 +0,0 @@ -from jinja2 import Template -import os -import wikipedia -from markdown import markdown - -# importing module -import sys - -# appending a path -# sys.path.append('textrank') - -# importing required module -import summa.summarizer -from summa.summarizer import summarize - - -# TODO: -# * DONE: wiki header - -# those 3 would ask to start from the HTML itself and keep and index... -# * wiki paragraph -# * wiki hyperlinks -# * list - - -# variables -# ------------------------------------------------------------------------ - -# wikipedia_page = "forest" -# wikipedia_page = "warehouse" -# wikipedia_page = "river" -wikipedia_page = "elderflower" -# wikipedia_page = "mushroom" - -TEMPLATE_PATH = 'template.html' -HTML_PATH = 'www/index.html' - - -# utilities -# ------------------------------------------------------------------------ - -def map_value(value, min, max, new_min, new_max): - return (((value - min) / (max - min)) * (new_max - new_min)) + new_min - -def remap_score(s, min_score, max_score): - s.score = 1 - map_value(s.score, min_score, max_score, 0, 1) - return s - -def compress_score(s): - - # compress whites - s.score = s.score**3 - - # stretch + limiter - # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1) - s.score = 1 if s.score > 0.8 else s.score - - return s - - -# wikipedia -# ------------------------------------------------------------------------ - -def wikipage(pagename): - # get wikipedia page content by name of the page - - print(pagename) - wikipedia.set_lang("en") - try: - results = wikipedia.search(pagename, results=1, suggestion=False) - try: - pagename = results[0] - except IndexError: - # if there is no suggestion or search results, the page doesn't exist - raise wikipedia.PageError(pagename) - return wikipedia.WikipediaPage(pagename, redirect=True, preload=True) - except wikipedia.exceptions.DisambiguationError as e: - print(e.options) - page = '' - - return page - - -# parsing and gluing html -# ------------------------------------------------------------------------ - -def is_header(s): - - # i is the header level - i = 0 - while s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=': - i += 1 - - if i > 0: - header_text = s.text[i:(-1-i)].strip() - header_level = i - return [header_text, header_level] - -def wiki_parse(sentences): - - # TODO: doesn't work with section nesting!! - # 1. replace wikitext header with html header - # 2. add the opacity to each elements - # 3. compute an artificial score for header that is an average of the score of the section - - new_sentences = [] - - print('--- HEADERS ---') - for i in range(len(sentences)): - - s = sentences[i] - - # if sentences is header - header = is_header(s) - if header: - print(header[0]) - - # start computing the average of score of this section - current_total = 0 - current_count = 0 - next_header_found = False - j = i + 1 - - # iterating while we find next header with greatest or same level - while j < len(sentences) and not next_header_found: - - s2 = sentences[j] - s2_header = is_header(s2) - - if s2_header: - print(' ' + s2_header[0]) - if header[1] >= s2_header[1]: - # encounter header of higher level - next_header_found = True - print('X ' + s2_header[0]) - - else: - # adding every sentence to the average - current_total += s2.score - current_count += 1 - - j += 1 - - if current_count != 0: - s.score = current_total / current_count - else: - s.score = "NaN" - - s.html = ''+header[0]+'' - - # stops at the references part - if header[0] == "References" or header[0] == "See also": - break - - new_sentences.append(s) - - # not a header - else: - s.html = ''+s.text+'' - new_sentences.append(s) - - return new_sentences - - -# textrank -# ------------------------------------------------------------------------ - -def txt2rankedsentences(txt): - # from txt to ranked sentences - return summarize(txt, split=True) - - -# main -# ------------------------------------------------------------------------ - -if __name__ == '__main__': - - - # --- WIKI REQUEST --- - - # get text from wikipedia - print('--- WIKI ---') - page = wikipage(wikipedia_page) - if not page: - sys.exit("--- STOP ---") - title = '

'+page.title+'

' - text = page.content - - # print text in terminal - print('--- TXT ---') - print(text) - - - # --- APPLY TEXTRANK --- - - # apply textrank - sentences = txt2rankedsentences(text) - - # print ranked sentences in terminal - print('--- SENTENCES ---') - for s in sentences: - print('[{score}] : {sentence}'.format(score = s.score, sentence = s.text)) - - - # --- REMAP AND COMPRESS --- - - # sorted version of the list - sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True) - # remap sentences from 0 to 1 - max_score = sorted_sentences[0].score - min_score = sorted_sentences[-1].score - sentences = [remap_score(s, min_score, max_score) for s in sentences] - # compress scores (make more stuff invisible) - sentences = [compress_score(s) for s in sentences] - - - # -- PARSE --- - - # parse every sentences to either span or header - sentences = wiki_parse(sentences) - # add back page title - sentences = [{ 'html': title, 'text': page.title, 'score': 1 }] + sentences - - - # -- TEMPLATING --- - - # getting the template - with open(TEMPLATE_PATH, 'r') as file: - template = Template(file.read()) - # render template - html = template.render(sentences = sentences) - with open(HTML_PATH, 'w') as file: - file.write(html) diff --git a/summa/__init__.py b/summa/__init__.py index e55963f..efdc443 100644 --- a/summa/__init__.py +++ b/summa/__init__.py @@ -1,2 +1,2 @@ from summa import commons, graph, keywords, pagerank_weighted, \ - summarizer, syntactic_unit, textrank + summarizer, edits, syntactic_unit, textrank diff --git a/summa/edits.py b/summa/edits.py new file mode 100644 index 0000000..ebb7b5d --- /dev/null +++ b/summa/edits.py @@ -0,0 +1,33 @@ +from .pagerank_weighted import pagerank_weighted_scipy as _pagerank +from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences +from .commons import build_graph as _build_graph +from .commons import remove_unreachable_nodes as _remove_unreachable_nodes +from .summarizer import _set_graph_edge_weights +from .summarizer import _add_scores_to_sentences + + +def scored_sentences(text, language="english", split=False, additional_stopwords=None): + if not isinstance(text, str): + raise ValueError("Text parameter must be a Unicode object (str)!") + + # Gets a list of processed sentences. + sentences = _clean_text_by_sentences(text, language, additional_stopwords) + + # Creates the graph and calculates the similarity coefficient for every pair of nodes. + graph = _build_graph([sentence.token for sentence in sentences]) + _set_graph_edge_weights(graph) + + # Remove all nodes with all edges weights equal to zero. + _remove_unreachable_nodes(graph) + + # PageRank cannot be run in an empty graph. + if len(graph.nodes()) == 0: + return [] if split else "" + + # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score + pagerank_scores = _pagerank(graph) + + # Adds the summa scores to the sentence objects. + _add_scores_to_sentences(sentences, pagerank_scores) + + return sentences \ No newline at end of file diff --git a/template.html b/template.html deleted file mode 100644 index 504c0eb..0000000 --- a/template.html +++ /dev/null @@ -1,31 +0,0 @@ - - - - - - - - - TextRank Opacity - - - - - - - - - - - -
- - {% for s in sentences %} - {{ s.html|safe }} - {% endfor %} - -
- - - - diff --git a/texts/warehouse.txt b/texts/warehouse.txt deleted file mode 100644 index b16a42e..0000000 --- a/texts/warehouse.txt +++ /dev/null @@ -1 +0,0 @@ -A warehouse is a building for storing goods. Warehouses are used by manufacturers, importers, exporters, wholesalers, transport businesses, customs, etc. They are usually large plain buildings in industrial parks on the outskirts of cities, towns, or villages. They usually have loading docks to load and unload goods from trucks. Sometimes warehouses are designed for the loading and unloading of goods directly from railways, airports, or seaports. They often have cranes and forklifts for moving goods, which are usually placed on ISO standard pallets and then loaded into pallet racks. Stored goods can include any raw materials, packing materials, spare parts, components, or finished goods associated with agriculture, manufacturing, and production. In India and Hong Kong, a warehouse may be referred to as a "godown". There are also godowns in the Shanghai Bund. \ No newline at end of file diff --git a/www/css/main.css b/www/css/main.css deleted file mode 100644 index 46097e8..0000000 --- a/www/css/main.css +++ /dev/null @@ -1,38 +0,0 @@ - -:root{ - --lh: 1.35rem; -} - -body{ - margin: var(--lh); - line-height: var(--lh); -} - -@media print{ - body{ - margin: 0; - font-size: 10pt; - } -} - -main{ - max-width: 42rem; - margin: 0 auto; -} - -/* h1,h2,h3,h4,h5,h6{ - line-height: var(--lh); -} */ - -h1{ - text-align: center; - margin: calc(2 * var(--lh)) 0; -} - -h2,h3,h4,h5,h6{ - margin: calc(3 * var(--lh)) 0 var(--lh); -} - -:is(h1,h2,h3,h4,h5,h6) + :is(h1,h2,h3,h4,h5,h6){ - margin-top: var(--lh); -} \ No newline at end of file diff --git a/www/index.html b/www/index.html deleted file mode 100644 index 1dec090..0000000 --- a/www/index.html +++ /dev/null @@ -1,177 +0,0 @@ - - - - - - - - - TextRank Opacity - - - - - - - - - - - -
- - -

Sambucus

- - Sambucus is a genus of flowering plants in the family Adoxaceae. - - The various species are commonly called elder or elderberry. - - The genus was formerly placed in the honeysuckle family, Caprifoliaceae, but was reclassified as Adoxaceae due to genetic and morphological comparisons to plants in the genus Adoxa. - -

Description

- - The oppositely arranged leaves are pinnate with 5–9 leaflets (or, rarely, 3 or 11). - - Each leaf is 5–30 cm (2–12 in) long, and the leaflets have serrated margins. - - They bear large clusters of small white or cream-colored flowers in late spring; these are followed by clusters of small black, blue-black, or red berries (rarely yellow or white). - -

Color

- - Sambucus fruit is rich in anthocyanidins that combine to give elderberry juice an intense blue-purple coloration that turns reddish on dilution with water. - - These pigments are used as colorants in various products, and "elderberry juice color" is listed by the US FDA as allowable in certified organic food products. - - In Japan, elderberry juice is listed as an approved "natural color additive" under the Food and Sanitation Law. Fibers can be dyed with elderberry juice (using alum as a mordant) to give a light "elderberry" color. - -

Toxicity

- - Although the cooked berries (pulp and skin) of most species of Sambucus are edible, the uncooked berries and other parts of plants from this genus are poisonous. - - Leaves, twigs, branches, seeds, roots, flowers, and berries of Sambucus plants produce cyanogenic glycosides, which have toxic properties. - - Ingesting a sufficient quantity of cyanogenic glycosides from berry juice, flower tea, or beverages made from fresh leaves, branches, and fruit has been shown to cause illness, including nausea, vomiting, abdominal cramps, diarrhea, and weakness. - - In August 1983, a group of 25 people in Monterey County, California, became suddenly ill by ingesting elderberry juice pressed from fresh, uncooked Sambucus mexicana berries, leaves, and stems. - - The density of cyanogenic glycosides is higher in tea made from flowers (or leaves) than from the berries.The seeds of Sambucus callicarpa are reported to be poisonous and may cause vomiting or diarrhea. - -

Taxonomy

- - The taxonomy of the genus Sambucus L., originally described by Carl Linnaeus and hence its botanical authority, has been complicated by its wide geographical distribution and morphological diversity. - - This has led to overdescription of the species and infraspecific taxa (subspecies, varieties or forms). - - The name comes from the Greek word sambuce, an ancient wind instrument, about the removal of pith from the twigs to make whistles.Species recognized in this genus are: - -

Distribution and habitat

- - The genus occurs in temperate to subtropical regions of the world. - - More widespread in the Northern Hemisphere, its Southern Hemisphere occurrence is restricted to parts of Australasia and South America. - - Many species are widely cultivated for their ornamental leaves, flowers, and fruit. - -

Habitat

- - Elder commonly grows near farms and homesteads. - - It is a nitrogen-dependent plant and thus is generally found near places of organic waste disposal. - - Elders are often grown as a hedgerow plant in Britain since they take very fast, can be bent into shape easily, and grow quite profusely, thus having gained the reputation of being 'an instant hedge'. - - It is not generally affected by soil type or pH level and will virtually grow anywhere sufficient sunlight is available. - -

Ecology

- - In Northern California, elderberries are a food for migrating band-tailed pigeons. - - Elders are used as food plants by the larvae of some Lepidoptera species including brown-tail, buff ermine, dot moth, emperor moth, engrailed moth, swallow-tailed moth and the V-pug. - - The crushed foliage and immature fruit have a strong fetid smell. - - Valley elderberry longhorn beetles in California are very often found around red or blue elderberry bushes. - - Females lay their eggs on the bark. - - The pith of elder has been used by watchmakers for cleaning tools before intricate work. - -

Cultivation

- - Traditional uses of Sambucus involved berries, seeds, leaves, and flowers or component extracts. - - Ornamental varieties of Sambucus are grown in gardens for their showy flowers, fruits and lacy foliage which support habitat for wildlife. - - Of the many native species, three are used as ornamentals, S. - - nigra, S. - - canadensis and S. - - racemosa. - -

Uses

- -

Nutrition

- - Raw elderberries are 80% water, 18% carbohydrates, and less than 1% each of protein and fat (table). - - In a 100-gram (3+1⁄2 oz) amount, elderberries supply 305 kilojoules (73 kcal) of food energy and are a rich source of vitamin C, providing 43% of the Daily Value (DV). - - Elderberries also have moderate contents of vitamin B6 (18% DV) and iron (12% DV), with no other nutrients in significant content. - -

Dietary supplement

- - Elderberry fruit or flowers are used as dietary supplements to prevent or provide relief from minor diseases, such as flu, colds, constipation, and other conditions, served as a tea, extract or in a capsule. - - The use of elderberry supplements increased early in the COVID-19 pandemic. - - There is insufficient research to establish its effectiveness for such uses, or its safety profile. - - The raw or unripe fruit of S. - - nigra or its extracts may contain a cyanogenic glycoside that is potentially toxic. - -

Traditional medicine

- - Although practitioners of traditional medicine have used elderberry over centuries, there is no high-quality clinical evidence that such practices provide any benefit. - - The flowers of Sambucus nigra are used to produce elderflower cordial. - - St-Germain, a French liqueur, is made from elderflowers. - - Hallands Fläder, a Swedish akvavit, is flavoured with elderflowers. - - Hollowed elderberry twigs have traditionally been used as spiles to tap maple trees for syrup. - - Additionally, they have been hollowed out and used as flutes, blowguns, and syringes.The fruit of S. - - callicarpa is eaten by birds and mammals. - - It is inedible to humans when raw but can be made into wine.Elderberry twigs and fruit are employed in creating dyes for basketry. - - These stems are dyed a very deep black by soaking them in a wash made from the berry stems of the elderberry. - -

In popular culture

- - Folklore related to elder trees is extensive and can vary according to region. - - In some traditions, the elder tree is thought to ward off evil and give protection from witches, while other beliefs say that witches often congregate under the plant, especially when it is full of fruit. - - If an elder tree was cut down, a spirit known as the Elder Mother would be released and take her revenge. - - The tree could only safely be cut while chanting a rhyme to the Elder Mother.Made from the branch of an elder tree, the Elder Wand plays a pivotal role in the final book of the Harry Potter series, which was nearly named Harry Potter and the Elder Wand before author J. - - K. Rowling decided on Harry Potter and the Deathly Hallows.Elton John's 1973 album Don't Shoot Me I'm Only the Piano Player features a song titled "Elderberry Wine". - - In Monty Python and the Holy Grail, John Cleese as the French Taunter tells the knights of Camelot, "Your mother was a hamster, and your father smelt of elderberries." - -

Gallery

- - -
- - - - \ No newline at end of file