From b8343c650f7eceee6707be5dbb37a9eb16046981 Mon Sep 17 00:00:00 2001
From: Dorian
Date: Sat, 15 Oct 2022 19:43:49 +0200
Subject: [PATCH] first experiment with opacity

---
 README.md                                     |    4 +
 make.py                                       |  233 +
 summa/__init__.py                             |    2 +
 summa/__pycache__/__init__.cpython-38.pyc     |  Bin 0 -> 342 bytes
 summa/__pycache__/commons.cpython-38.pyc      |  Bin 0 -> 807 bytes
 summa/__pycache__/graph.cpython-38.pyc        |  Bin 0 -> 7422 bytes
 summa/__pycache__/keywords.cpython-38.pyc     |  Bin 0 -> 6680 bytes
 .../pagerank_weighted.cpython-38.pyc          |  Bin 0 -> 2782 bytes
 summa/__pycache__/summarizer.cpython-38.pyc   |  Bin 0 -> 4305 bytes
 .../__pycache__/syntactic_unit.cpython-38.pyc |  Bin 0 -> 923 bytes
 summa/__pycache__/textrank.cpython-38.pyc     |  Bin 0 -> 3471 bytes
 summa/commons.py                              |   15 +
 summa/exception/__init__.py                   |    0
 summa/exception/textrank_runtime_error.py     |    2 +
 summa/graph.py                                |  244 +
 summa/keywords.py                             |  227 +
 summa/pagerank_weighted.py                    |   86 +
 summa/preprocessing/__init__.py               |    0
 .../__pycache__/__init__.cpython-38.pyc       |  Bin 0 -> 196 bytes
 .../__pycache__/porter.cpython-38.pyc         |  Bin 0 -> 13383 bytes
 .../__pycache__/snowball.cpython-38.pyc       |  Bin 0 -> 98836 bytes
 .../__pycache__/stopwords.cpython-38.pyc      |  Bin 0 -> 19856 bytes
 .../__pycache__/textcleaner.cpython-38.pyc    |  Bin 0 -> 6821 bytes
 .../__pycache__/util.cpython-38.pyc           |  Bin 0 -> 701 bytes
 summa/preprocessing/porter.py                 |  635 +++
 summa/preprocessing/snowball.py               | 4291 +++++++++++++++++
 summa/preprocessing/stopwords.py              |  210 +
 summa/preprocessing/textcleaner.py            |  188 +
 summa/preprocessing/util.py                   |   24 +
 summa/summarizer.py                           |  154 +
 summa/syntactic_unit.py                       |   14 +
 summa/textrank.py                             |   97 +
 template.html                                 |   31 +
 texts/warehouse.txt                           |    1 +
 www/css/main.css                              |   38 +
 www/index.html                                |  177 +
 36 files changed, 6673 insertions(+)
 create mode 100644 README.md
 create mode 100644 make.py
 create mode 100644 summa/__init__.py
 create mode 100644 summa/__pycache__/__init__.cpython-38.pyc
 create mode 100644 summa/__pycache__/commons.cpython-38.pyc
 create mode 100644 summa/__pycache__/graph.cpython-38.pyc
 create mode 100644 summa/__pycache__/keywords.cpython-38.pyc
 create mode 100644 summa/__pycache__/pagerank_weighted.cpython-38.pyc
 create mode 100644 summa/__pycache__/summarizer.cpython-38.pyc
 create mode 100644 summa/__pycache__/syntactic_unit.cpython-38.pyc
 create mode 100644 summa/__pycache__/textrank.cpython-38.pyc
 create mode 100644 summa/commons.py
 create mode 100644 summa/exception/__init__.py
 create mode 100644 summa/exception/textrank_runtime_error.py
 create mode 100644 summa/graph.py
 create mode 100644 summa/keywords.py
 create mode 100644 summa/pagerank_weighted.py
 create mode 100644 summa/preprocessing/__init__.py
 create mode 100644 summa/preprocessing/__pycache__/__init__.cpython-38.pyc
 create mode 100644 summa/preprocessing/__pycache__/porter.cpython-38.pyc
 create mode 100644 summa/preprocessing/__pycache__/snowball.cpython-38.pyc
 create mode 100644 summa/preprocessing/__pycache__/stopwords.cpython-38.pyc
 create mode 100644 summa/preprocessing/__pycache__/textcleaner.cpython-38.pyc
 create mode 100644 summa/preprocessing/__pycache__/util.cpython-38.pyc
 create mode 100644 summa/preprocessing/porter.py
 create mode 100644 summa/preprocessing/snowball.py
 create mode 100644 summa/preprocessing/stopwords.py
 create mode 100644 summa/preprocessing/textcleaner.py
 create mode 100644 summa/preprocessing/util.py
 create mode 100644 summa/summarizer.py
 create mode 100644 summa/syntactic_unit.py
 create mode 100644 summa/textrank.py
 create mode 100644 template.html
 create mode 100644 texts/warehouse.txt
 create mode 100644 www/css/main.css
 create mode 100644 www/index.html
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3b4dfe2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,4 @@
+
+opacity experiment using:
+* textrank python implementation (https://github.com/summanlp/textrank), modified under `summa/` so that it gives us all the sentences with their scores.
+* wikipedia python module (https://pypi.org/project/wikipedia/)
\ No newline at end of file
diff --git a/make.py b/make.py
new file mode 100644
index 0000000..c1b8436
--- /dev/null
+++ b/make.py
@@ -0,0 +1,233 @@
+from jinja2 import Template
+import os
+import wikipedia
+from markdown import markdown
+
+# importing module
+import sys
+
+# appending a path
+# sys.path.append('textrank')
+
+# importing required module
+import summa.summarizer
+from summa.summarizer import summarize
+
+
+# TODO:
+# * DONE: wiki header
+
+# these three would require starting from the HTML itself and keeping an index...
+# * wiki paragraph
+# * wiki hyperlinks
+# * list
+
+
+# variables
+# ------------------------------------------------------------------------
+
+# wikipedia_page = "forest"
+# wikipedia_page = "warehouse"
+# wikipedia_page = "river"
+wikipedia_page = "elderflower"
+# wikipedia_page = "mushroom"
+
+TEMPLATE_PATH = 'template.html'
+HTML_PATH = 'www/index.html'
+
+
+# utilities
+# ------------------------------------------------------------------------
+
+def map_value(value, min, max, new_min, new_max):
+    return (((value - min) / (max - min)) * (new_max - new_min)) + new_min
+
+def remap_score(s, min_score, max_score):
+    s.score = 1 - map_value(s.score, min_score, max_score, 0, 1)
+    return s
+
+def compress_score(s):
+
+    # compress whites
+    s.score = s.score**3
+
+    # stretch + limiter
+    # s.score = min(map_value(s.score, 0, 1, 0, 1.5), 1)
+    s.score = 1 if s.score > 0.8 else s.score
+
+    return s
+
+
+# wikipedia
+# ------------------------------------------------------------------------
+
+def wikipage(pagename):
+    # get wikipedia page content by page name
+
+    print(pagename)
+    wikipedia.set_lang("en")
+    try:
+        results = wikipedia.search(pagename, results=1, suggestion=False)
+        try:
+            pagename = results[0]
+        except IndexError:
+            # if there is no suggestion or search result, the page doesn't exist
+            raise wikipedia.PageError(pagename)
+        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
+    except wikipedia.exceptions.DisambiguationError as e:
+        print(e.options)
+        page = ''
+
+    return page
+
+
+# parsing and gluing html
+# ------------------------------------------------------------------------
+
+def is_header(s):
+
+    # i is the header level
+    i = 0
+    while s.text[i] == '=' and s.text[len(s.text) - 1 - i] == '=':
+        i += 1
+
+    if i > 0:
+        header_text = s.text[i:(-1-i)].strip()
+        header_level = i
+        return [header_text, header_level]
+
+def wiki_parse(sentences):
+
+    # TODO: doesn't work with section nesting!!
+    # 1. replace each wikitext header with an html header
+    # 2. add the opacity to each element
+    # 3. compute an artificial score for each header: the average score of its section
+
+    new_sentences = []
+
+    print('--- HEADERS ---')
+    for i in range(len(sentences)):
+
+        s = sentences[i]
+
+        # if the sentence is a header
+        header = is_header(s)
+        if header:
+            print(header[0])
+
+            # start computing the average score of this section
+            current_total = 0
+            current_count = 0
+            next_header_found = False
+            j = i + 1
+
+            # iterate until we reach the next header of the same or higher level
+            while j < len(sentences) and not next_header_found:
+
+                s2 = sentences[j]
+                s2_header = is_header(s2)
+
+                if s2_header:
+                    print(' ' + s2_header[0])
+                    if header[1] >= s2_header[1]:
+                        # encountered a header of the same or higher level
+                        next_header_found = True
+                        print('X ' + s2_header[0])
+
+                else:
+                    # add every sentence to the average
+                    current_total += s2.score
+                    current_count += 1
+
+                j += 1
+
+            if current_count != 0:
+                s.score = current_total / current_count
+            else:
+                s.score = "NaN"
+
+            s.html = ''+header[0]+''
+
+            # stop at the references part
+            if header[0] == "References" or header[0] == "See also":
+                break
+
+            new_sentences.append(s)
+
+        # not a header
+        else:
+            s.html = ''+s.text+''
+            new_sentences.append(s)
+
+    return new_sentences
+
+
+# textrank
+# ------------------------------------------------------------------------
+
+def txt2rankedsentences(txt):
+    # from txt to ranked sentences
+    return summarize(txt, split=True)
+
+
+# main
+# ------------------------------------------------------------------------
+
+if __name__ == '__main__':
+
+
+    # --- WIKI REQUEST ---
+
+    # get text from wikipedia
+    print('--- WIKI ---')
+    page = wikipage(wikipedia_page)
+    if not page:
+        sys.exit("--- STOP ---")
+    title = ''+page.title+''
+    text = page.content
+
+    # print text in terminal
+    print('--- TXT ---')
+    print(text)
+
+
+    # --- APPLY TEXTRANK ---
+
+    # apply textrank
+    sentences = txt2rankedsentences(text)
+
+    # print ranked sentences in terminal
+    print('--- SENTENCES ---')
+    for s in sentences:
+        print('[{score}] : {sentence}'.format(score = s.score, sentence = s.text))
+
+
+    # --- REMAP AND COMPRESS ---
+
+    # sorted version of the list
+    sorted_sentences = sorted(sentences, key=lambda s: s.score, reverse=True)
+    # remap sentence scores to [0, 1]
+    max_score = sorted_sentences[0].score
+    min_score = sorted_sentences[-1].score
+    sentences = [remap_score(s, min_score, max_score) for s in sentences]
+    # compress scores (make more stuff invisible)
+    sentences = [compress_score(s) for s in sentences]
+
+
+    # -- PARSE ---
+
+    # parse every sentence to either a span or a header
+    sentences = wiki_parse(sentences)
+    # add back the page title
+    sentences = [{ 'html': title, 'text': page.title, 'score': 1 }] + sentences
+
+
+    # -- TEMPLATING ---
+
+    # get the template
+    with open(TEMPLATE_PATH, 'r') as file:
+        template = Template(file.read())
+    # render the template
+    html = template.render(sentences = sentences)
+    with open(HTML_PATH, 'w') as file:
+        file.write(html)
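For reference, a small self-contained sketch of what the remapping above does to made-up scores (illustrative values only; in make.py the numbers come from the modified summa summarizer, each sentence object carrying a numeric score):

# Illustrative only: three fake raw TextRank scores (lowest, middle, highest).
raw = [0.02, 0.10, 0.35]
lo, hi = min(raw), max(raw)

# remap_score: rescale linearly to [0, 1], then invert.
remapped = [1 - ((v - lo) / (hi - lo)) for v in raw]      # -> [1.0, ~0.76, 0.0]

# compress_score: cube ("compress whites"), then snap values above 0.8 up to 1.
cubed = [r ** 3 for r in remapped]
final = [1 if c > 0.8 else c for c in cubed]              # -> [1, ~0.43, 0]
# Note that after the inversion the top-ranked sentence ends up with the lowest value.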
diff --git a/summa/__init__.py b/summa/__init__.py
new file mode 100644
index 0000000..e55963f
--- /dev/null
+++ b/summa/__init__.py
@@ -0,0 +1,2 @@
+from summa import commons, graph, keywords, pagerank_weighted, \
+    summarizer, syntactic_unit, textrank
diff --git a/summa/__pycache__/__init__.cpython-38.pyc b/summa/__pycache__/__init__.cpython-38.pyc
new file mode 100644
GIT binary patch literal 342 [binary data omitted]
diff --git a/summa/__pycache__/commons.cpython-38.pyc b/summa/__pycache__/commons.cpython-38.pyc
new file mode 100644
GIT binary patch literal 807 [binary data omitted]
diff --git a/summa/__pycache__/graph.cpython-38.pyc b/summa/__pycache__/graph.cpython-38.pyc
new file mode 100644
GIT binary patch literal 7422 [binary data omitted]
diff --git a/summa/__pycache__/keywords.cpython-38.pyc b/summa/__pycache__/keywords.cpython-38.pyc
new file mode 100644
GIT binary patch literal 6680 [binary data omitted]
diff --git a/summa/__pycache__/pagerank_weighted.cpython-38.pyc b/summa/__pycache__/pagerank_weighted.cpython-38.pyc
new file mode 100644
GIT binary patch literal 2782 [binary data omitted]
diff --git a/summa/__pycache__/summarizer.cpython-38.pyc b/summa/__pycache__/summarizer.cpython-38.pyc
new file mode 100644
GIT binary patch literal 4305 [binary data omitted]
diff --git a/summa/__pycache__/syntactic_unit.cpython-38.pyc b/summa/__pycache__/syntactic_unit.cpython-38.pyc
new file mode 100644
GIT binary patch literal 923 [binary data omitted]
diff --git a/summa/__pycache__/textrank.cpython-38.pyc b/summa/__pycache__/textrank.cpython-38.pyc
new file mode 100644
GIT binary patch literal 3471 [binary data omitted]
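The vendored summa sources start below. The diffs for summa/summarizer.py and summa/syntactic_unit.py fall outside this excerpt, so the actual change that exposes a score for every sentence is not visible here; judging from how make.py consumes the result, the assumed interface is roughly the following sketch (hypothetical, based on the upstream summa package rather than on the diff itself):

class SyntacticUnit:
    # Assumed shape of the objects that make.py iterates over.
    def __init__(self, text, token=None, score=0.0):
        self.text = text      # original sentence, printed and wrapped in HTML by make.py
        self.token = token    # preprocessed form used for ranking
        self.score = score    # numeric TextRank score, later remapped by make.py
        self.html = ''        # filled in by make.py's wiki_parse()

def summarize(text, split=True):
    """Assumed behaviour of the modified summarizer: return all ranked
    sentences as SyntacticUnit objects instead of a trimmed summary string."""
    raise NotImplementedError  # placeholder only; the real diff is not shown here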
zs~v|~W<2k|aXtNl=S7j?QwPM6o#2cuIdY6G1H zlZEyIQPLkk5L|-}F7>1icS8kOftwqzE8~7%sn^Gnw(5&X@13$i9;85KdGJNCD?>6>SA$Y4#DJ6o)~SL9;PX3bK|bv)C@n3tufQ+&xef zH~FLUTSEj={UC|XA8~(Trr^#}jVwh-8Qi&Bs}c zQmWT2zT$MHrfDR6LLn3+gC4*WTQ1yv2@PNbd{F{lAU}y^($QVI5A{Yy*lz@%=wD`V z4%oM!Dl6;h3V1^&LJqLDsd+!%OMQSdCs(l;kE^`Niyw~YaBXxCm^zMnB>GREzIkS9 zqUorU0P95nLlCRg&1ffU2HnW7*6aT(N#Pn{J{JS>7SgSSN$$$kDjXY{2?GKTcs@jI zr^uz!V!GG=|=5t&<(Ty!v}biyHf2#>O|I@k;2GgwW>uo zGc$(pXN2g=ygTl%jLe(;7T*5Ef*&H!+=7U$-arq+INoo`y>^VS-IcA5N(X&YrCDSL zn-eUI8#hX{qO@nc=TFyHEE1aOmDMMYKmXlkYyI)&Cr?eqW)ot}5N;lQP0@}iQ?NT< zfL!$}Z1@r0YzYnH^G?;NaOXe8QR`NTb}!qJ8Qa>C(^<>iCdTnYxn+XJ|3pVK%`FzQ zKR7pRrV;D_wHJ_>ZM$lL+5<;>2RvB=T~N!^Vx)+X2cS!;ul?h)aCGIw0jjd?Da%^| zuA_F_@l~#?$J4^4ER|F&?GcSLkiE8N)Zg``?b+iwqEEQ`R?ic4$iKdPfSe$TZ`sgW zp_{qSPYZ$$u=(SyKddf zxY<536kK_CRGCr;*z#1}0o5#>Vx93(Q6>m~EGEj#kMjT!{`qK1m56szi`d%LI4LDS z0D>vndMZ=+E5Q`SgDLa@DB7lFMK@RE+d%PO)1=o+g&b{3YBYT7qU;{Y_YiX#2}IH5 zanC98v%f0!9G@fZFk3{`R^>Mlh3XMFjNfX}9<5ewc69Ii7*+SE`3=oiCh1oFShri0 zu-Z{IT1f@6#Sn`bR;!cUsk&^n&Rm=-NM*}Q{{aLX#B!OEUq*?`M-_&jQ{`2cB=Ow& OlJEN^|Azm*&wl_&d!|$X literal 0 HcmV?d00001 diff --git a/summa/commons.py b/summa/commons.py new file mode 100644 index 0000000..ac7f939 --- /dev/null +++ b/summa/commons.py @@ -0,0 +1,15 @@ +from .graph import Graph + + +def build_graph(sequence): + graph = Graph() + for item in sequence: + if not graph.has_node(item): + graph.add_node(item) + return graph + + +def remove_unreachable_nodes(graph): + for node in graph.nodes(): + if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: + graph.del_node(node) diff --git a/summa/exception/__init__.py b/summa/exception/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/summa/exception/textrank_runtime_error.py b/summa/exception/textrank_runtime_error.py new file mode 100644 index 0000000..eee2584 --- /dev/null +++ b/summa/exception/textrank_runtime_error.py @@ -0,0 +1,2 @@ +class TextrankRuntimeError(RuntimeError): + pass \ No newline at end of file diff --git a/summa/graph.py b/summa/graph.py new file mode 100644 index 0000000..48075ec --- /dev/null +++ b/summa/graph.py @@ -0,0 +1,244 @@ +from abc import ABCMeta, abstractmethod + + +class IGraph(metaclass=ABCMeta): + """ + Represents the interface or contract that the graph for TextRank should implement + """ + + @abstractmethod + def nodes(self): + """ + Return node list. + + @rtype: list + @return: Node list. + """ + pass + + + @abstractmethod + def edges(self): + """ + Return all edges in the graph. + + @rtype: list + @return: List of all edges in the graph. + """ + pass + + @abstractmethod + def neighbors(self, node): + """ + Return all nodes that are directly accessible from given node. + + @type node: node + @param node: Node identifier + + @rtype: list + @return: List of nodes directly accessible from given node. + """ + pass + + + @abstractmethod + def has_node(self, node): + """ + Return whether the requested node exists. + + @type node: node + @param node: Node identifier + + @rtype: boolean + @return: Truth-value for node existence. + """ + pass + + + @abstractmethod + def add_node(self, node, attrs=None): + """ + Add given node to the graph. + + @attention: While nodes can be of any type, it's strongly recommended to use only + numbers and single-line strings as node identifiers if you intend to use write(). + + @type node: node + @param node: Node identifier. + + @type attrs: list + @param attrs: List of node attributes specified as (attribute, value) tuples. 
+ """ + pass + + + @abstractmethod + def add_edge(self, edge, wt=1, label='', attrs=[]): + """ + Add an edge to the graph connecting two nodes. + + An edge, here, is a pair of nodes like C{(n, m)}. + + @type edge: tuple + @param edge: Edge. + + @type wt: number + @param wt: Edge weight. + + @type label: string + @param label: Edge label. + + @type attrs: list + @param attrs: List of node attributes specified as (attribute, value) tuples. + """ + pass + + + @abstractmethod + def has_edge(self, edge): + """ + Return whether an edge exists. + + @type edge: tuple + @param edge: Edge. + + @rtype: boolean + @return: Truth-value for edge existence. + """ + pass + + + @abstractmethod + def edge_weight(self, edge): + """ + Get the weight of an edge. + + @type edge: edge + @param edge: One edge. + + @rtype: number + @return: Edge weight. + """ + pass + + + @abstractmethod + def del_node(self, node): + """ + Remove a node from the graph. + + @type node: node + @param node: Node identifier. + """ + pass + + +class Graph(IGraph): + """ + Implementation of an undirected graph, based on Pygraph + """ + + WEIGHT_ATTRIBUTE_NAME = "weight" + DEFAULT_WEIGHT = 0 + + LABEL_ATTRIBUTE_NAME = "label" + DEFAULT_LABEL = "" + + def __init__(self): + # Metadata about edges + self.edge_properties = {} # Mapping: Edge -> Dict mapping, lablel-> str, wt->num + self.edge_attr = {} # Key value pairs: (Edge -> Attributes) + # Metadata about nodes + self.node_attr = {} # Pairing: Node -> Attributes + self.node_neighbors = {} # Pairing: Node -> Neighbors + + def has_edge(self, edge): + u,v = edge + return (u,v) in self.edge_properties and (v,u) in self.edge_properties + + def edge_weight(self, edge): + return self.get_edge_properties( edge ).setdefault( self.WEIGHT_ATTRIBUTE_NAME, self.DEFAULT_WEIGHT ) + + def neighbors(self, node): + return self.node_neighbors[node] + + def has_node(self, node): + return node in self.node_neighbors + + def add_edge(self, edge, wt=1, label='', attrs=[]): + u, v = edge + if (v not in self.node_neighbors[u] and u not in self.node_neighbors[v]): + self.node_neighbors[u].append(v) + if (u != v): + self.node_neighbors[v].append(u) + + self.add_edge_attributes((u,v), attrs) + self.set_edge_properties((u, v), label=label, weight=wt) + else: + raise ValueError("Edge (%s, %s) already in graph" % (u, v)) + + def add_node(self, node, attrs=None): + if attrs is None: + attrs = [] + if (not node in self.node_neighbors): + self.node_neighbors[node] = [] + self.node_attr[node] = attrs + else: + raise ValueError("Node %s already in graph" % node) + + def nodes(self): + return list(self.node_neighbors.keys()) + + def edges(self): + return [ a for a in list(self.edge_properties.keys()) ] + + def del_node(self, node): + for each in list(self.neighbors(node)): + if (each != node): + self.del_edge((each, node)) + del(self.node_neighbors[node]) + del(self.node_attr[node]) + + # Helper methods + def get_edge_properties(self, edge): + return self.edge_properties.setdefault( edge, {} ) + + def add_edge_attributes(self, edge, attrs): + for attr in attrs: + self.add_edge_attribute(edge, attr) + + def add_edge_attribute(self, edge, attr): + self.edge_attr[edge] = self.edge_attributes(edge) + [attr] + + if (edge[0] != edge[1]): + self.edge_attr[(edge[1],edge[0])] = self.edge_attributes((edge[1], edge[0])) + [attr] + + def edge_attributes(self, edge): + try: + return self.edge_attr[edge] + except KeyError: + return [] + + def set_edge_properties(self, edge, **properties ): + self.edge_properties.setdefault( edge, {} 
).update( properties ) + if (edge[0] != edge[1]): + self.edge_properties.setdefault((edge[1], edge[0]), {}).update( properties ) + + def del_edge(self, edge): + u, v = edge + self.node_neighbors[u].remove(v) + self.del_edge_labeling((u, v)) + if (u != v): + self.node_neighbors[v].remove(u) + self.del_edge_labeling((v, u)) # TODO: This is redundant + + def del_edge_labeling( self, edge ): + keys = [edge] + keys.append(edge[::-1]) + + for key in keys: + for mapping in [self.edge_properties, self.edge_attr ]: + try: + del ( mapping[key] ) + except KeyError: + pass \ No newline at end of file diff --git a/summa/keywords.py b/summa/keywords.py new file mode 100644 index 0000000..8505770 --- /dev/null +++ b/summa/keywords.py @@ -0,0 +1,227 @@ +from itertools import combinations as _combinations +from queue import Queue + +from .pagerank_weighted import pagerank_weighted_scipy as _pagerank +from .preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word +from .preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word +from .commons import build_graph as _build_graph +from .commons import remove_unreachable_nodes as _remove_unreachable_nodes + +WINDOW_SIZE = 2 + +"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters +Example: filter for nouns and adjectives: +INCLUDING_FILTER = ['NN', 'JJ']""" +INCLUDING_FILTER = ['NN', 'JJ'] +EXCLUDING_FILTER = [] + + +def _get_pos_filters(): + return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER) + + +def _get_words_for_graph(tokens): + include_filters, exclude_filters = _get_pos_filters() + if include_filters and exclude_filters: + raise ValueError("Can't use both include and exclude filters, should use only one") + + result = [] + for word, unit in tokens.items(): + if exclude_filters and unit.tag in exclude_filters: + continue + if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag: + result.append(unit.token) + return result + + +def _get_first_window(split_text): + return split_text[:WINDOW_SIZE] + + +def _set_graph_edge(graph, tokens, word_a, word_b): + if word_a in tokens and word_b in tokens: + lemma_a = tokens[word_a].token + lemma_b = tokens[word_b].token + edge = (lemma_a, lemma_b) + + if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge): + graph.add_edge(edge) + + +def _process_first_window(graph, tokens, split_text): + first_window = _get_first_window(split_text) + for word_a, word_b in _combinations(first_window, 2): + _set_graph_edge(graph, tokens, word_a, word_b) + + +def _init_queue(split_text): + queue = Queue() + first_window = _get_first_window(split_text) + for word in first_window[1:]: + queue.put(word) + return queue + + +def _process_word(graph, tokens, queue, word): + for word_to_compare in _queue_iterator(queue): + _set_graph_edge(graph, tokens, word, word_to_compare) + + +def _update_queue(queue, word): + queue.get() + queue.put(word) + assert queue.qsize() == (WINDOW_SIZE - 1) + + +def _process_text(graph, tokens, split_text): + queue = _init_queue(split_text) + for i in range(WINDOW_SIZE, len(split_text)): + word = split_text[i] + _process_word(graph, tokens, queue, word) + _update_queue(queue, word) + + +def _queue_iterator(queue): + iterations = queue.qsize() + for i in range(iterations): + var = queue.get() + yield var + queue.put(var) + + +def _set_graph_edges(graph, tokens, split_text): + _process_first_window(graph, tokens, split_text) + _process_text(graph, tokens, split_text) + + 
+def _extract_tokens(lemmas, scores, ratio, words): + lemmas.sort(key=lambda s: scores[s], reverse=True) + + # If no "words" option is selected, the number of sentences is + # reduced by the provided ratio, else, the ratio is ignored. + length = len(lemmas) * ratio if words is None else words + return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))] + + +def _lemmas_to_words(tokens): + lemma_to_word = {} + for word, unit in tokens.items(): + lemma = unit.token + if lemma in lemma_to_word: + lemma_to_word[lemma].append(word) + else: + lemma_to_word[lemma] = [word] + return lemma_to_word + + +def _get_keywords_with_score(extracted_lemmas, lemma_to_word): + """ + :param extracted_lemmas:list of tuples + :param lemma_to_word: dict of {lemma:list of words} + :return: dict of {keyword:score} + """ + keywords = {} + for score, lemma in extracted_lemmas: + keyword_list = lemma_to_word[lemma] + for keyword in keyword_list: + keywords[keyword] = score + return keywords + + +def _strip_word(word): + stripped_word_list = list(_tokenize_by_word(word)) + return stripped_word_list[0] if stripped_word_list else "" + + +def _get_combined_keywords(_keywords, split_text): + """ + :param keywords:dict of keywords:scores + :param split_text: list of strings + :return: combined_keywords:list + """ + result = [] + _keywords = _keywords.copy() + len_text = len(split_text) + for i in range(len_text): + word = _strip_word(split_text[i]) + if word in _keywords: + combined_word = [word] + if i + 1 == len_text: + result.append(word) # appends last word if keyword and doesn't iterate + for j in range(i + 1, len_text): + other_word = _strip_word(split_text[j]) + if other_word in _keywords and other_word == split_text[j] \ + and other_word not in combined_word: + combined_word.append(other_word) + else: + for keyword in combined_word: + _keywords.pop(keyword) + result.append(" ".join(combined_word)) + break + return result + + +def _get_average_score(concept, _keywords): + word_list = concept.split() + word_counter = 0 + total = 0 + for word in word_list: + total += _keywords[word] + word_counter += 1 + return total / word_counter + + +def _format_results(_keywords, combined_keywords, split, scores): + """ + :param keywords:dict of keywords:scores + :param combined_keywords:list of word/s + """ + combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True) + if scores: + return [(word, _get_average_score(word, _keywords)) for word in combined_keywords] + if split: + return combined_keywords + return "\n".join(combined_keywords) + + +def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False, deaccent=False, additional_stopwords=None): + if not isinstance(text, str): + raise ValueError("Text parameter must be a Unicode object (str)!") + + # Gets a dict of word -> lemma + tokens = _clean_text_by_word(text, language, deacc=deaccent, additional_stopwords=additional_stopwords) + split_text = list(_tokenize_by_word(text)) + + # Creates the graph and adds the edges + graph = _build_graph(_get_words_for_graph(tokens)) + _set_graph_edges(graph, tokens, split_text) + del split_text # It's no longer used + + _remove_unreachable_nodes(graph) + + # PageRank cannot be run in an empty graph. + if len(graph.nodes()) == 0: + return [] if split else "" + + # Ranks the tokens using the PageRank algorithm. 
Returns dict of lemma -> score + pagerank_scores = _pagerank(graph) + + extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words) + + lemmas_to_word = _lemmas_to_words(tokens) + keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word) + + # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined + combined_keywords = _get_combined_keywords(keywords, text.split()) + + return _format_results(keywords, combined_keywords, split, scores) + + +def get_graph(text, language="english", deaccent=False): + tokens = _clean_text_by_word(text, language, deacc=deaccent) + split_text = list(_tokenize_by_word(text, deacc=deaccent)) + + graph = _build_graph(_get_words_for_graph(tokens)) + _set_graph_edges(graph, tokens, split_text) + + return graph diff --git a/summa/pagerank_weighted.py b/summa/pagerank_weighted.py new file mode 100644 index 0000000..e0bb90c --- /dev/null +++ b/summa/pagerank_weighted.py @@ -0,0 +1,86 @@ +from scipy.sparse import csr_matrix +from scipy.linalg import eig +from numpy import empty as empty_matrix + +CONVERGENCE_THRESHOLD = 0.0001 + + +def pagerank_weighted(graph, initial_value=None, damping=0.85): + """Calculates PageRank for an undirected graph""" + if initial_value == None: initial_value = 1.0 / len(graph.nodes()) + scores = dict.fromkeys(graph.nodes(), initial_value) + + iteration_quantity = 0 + for iteration_number in range(100): + iteration_quantity += 1 + convergence_achieved = 0 + for i in graph.nodes(): + rank = 1 - damping + for j in graph.neighbors(i): + neighbors_sum = sum(graph.edge_weight((j, k)) for k in graph.neighbors(j)) + rank += damping * scores[j] * graph.edge_weight((j, i)) / neighbors_sum + + if abs(scores[i] - rank) <= CONVERGENCE_THRESHOLD: + convergence_achieved += 1 + + scores[i] = rank + + if convergence_achieved == len(graph.nodes()): + break + + return scores + + +def pagerank_weighted_scipy(graph, damping=0.85): + adjacency_matrix = build_adjacency_matrix(graph) + probability_matrix = build_probability_matrix(graph) + + # Suppress deprecation warnings from numpy. 
+    # See https://github.com/summanlp/textrank/issues/57
+    import warnings
+    with warnings.catch_warnings():
+        from numpy import VisibleDeprecationWarning
+        warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
+        warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
+        pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix
+
+    vals, vecs = eig(pagerank_matrix, left=True, right=False)
+    return process_results(graph, vecs)
+
+
+def build_adjacency_matrix(graph):
+    row = []
+    col = []
+    data = []
+    nodes = graph.nodes()
+    length = len(nodes)
+
+    for i in range(length):
+        current_node = nodes[i]
+        neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
+        for j in range(length):
+            edge_weight = float(graph.edge_weight((current_node, nodes[j])))
+            if i != j and edge_weight != 0:
+                row.append(i)
+                col.append(j)
+                data.append(edge_weight / neighbors_sum)
+
+    return csr_matrix((data,(row,col)), shape=(length,length))
+
+
+def build_probability_matrix(graph):
+    dimension = len(graph.nodes())
+    matrix = empty_matrix((dimension,dimension))
+
+    probability = 1 / float(dimension)
+    matrix.fill(probability)
+
+    return matrix
+
+
+def process_results(graph, vecs):
+    scores = {}
+    for i, node in enumerate(graph.nodes()):
+        scores[node] = abs(vecs[i][0])
+
+    return scores
diff --git a/summa/preprocessing/__init__.py b/summa/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/summa/preprocessing/__pycache__/__init__.cpython-38.pyc b/summa/preprocessing/__pycache__/__init__.cpython-38.pyc
new file mode 100644
GIT binary patch literal 196 [binary data omitted]
diff --git a/summa/preprocessing/__pycache__/porter.cpython-38.pyc b/summa/preprocessing/__pycache__/porter.cpython-38.pyc
new file mode 100644
GIT binary patch literal 13383 [binary data omitted]
z=}ZI_dtP))JMi6Tdl>Xx?+t>Q3)`sqrllsw>#@7!%uOsq&TnG&7y$Q=hkyoPK_2ss z<-wghOJzIR4L^rfw(-?ChR!uOWZbuyFZ28Jj|3sXHwfO(oF4+f7iy;{+;)+4EQ?1R|x)&;Hv;A;~(gLh~NM+Tn~2Ix9Isc!9Njvhd|V|zY7=Q zA7Q|w1pi9#7=bJp7n*;j`+EdGAov%8e)%h`9PZIou z;3)#~8lh9&%ZyJm;Ok7u>&i@q;D-$RcLJ`^t&TXoCB`}gqIlkG`smIQxCA+ZJORJk z=06Df=`9d&HH@ZNLibXD7JKT`?^6SkMOi=Vy8kq$XkYxBQyI35fU$#%&FOTnAUK0y zCBc~lWrDK^))1^ESV!=Df{O?)Cb)#)1q7E8i0qI{4)a2Kx!!1Tjlo^C-9+ci6uVvI zg&YUM4wYKme_+;qhvBlJh#Sz!=ZtNxE61}uJ0Y1P38P3A)aOW+C~<{zq)2f==SY#L zW$8UqhuI7?AP)_*4xM&ki3%II3tu!`XsYA&%5arBA44A0JyKQ!R1KXEoJoy)pjQzTkIjX71sjt0zhzU$t5NC6%-VxOb>1=)ajk4U5P7746g+Q zhjejA4#T z0FJF!M#Xhri$$~TUI}&;`f$;1Kc4+u`Qf?1JIPy!=YV%I&$zp3u3 zS8rlvg4V8GwVSH2%(Bh8aIGJ25A?t`Yet4=wusRgnC$41Z#MSHs!8r5ogdQv+JRS|y2Ukia-$ z6~3bj021m`ULn2|A+1)YVs|a&-j{MWstHBa+Kq{)miFvRdm8Y>c%hLNyK5=;zLdK$ zcC-{xJXVX{XwKM;){Na~%-D^#jNOft8!Z`!qaly>S^TVe>|txf9yVj_kyh-rd8*Z; zk+YUK_a)9oH7*NH6uZ$vu^SB(yU{)|!s=M;tR>EUiIZ)U>PqX@ew1OX1GJ=W4VQ6k z!`033upoF~+d~{^U3H*o)q!@kA!98Y5;@SE8p5_L5|1>bxTF=uB~2(UwjX5_c(Ygx z9a0`0QXU;ro`kR%35gt19z*yN6zPx`KwMHsammORmzF1WWqEW+d2~p5bVzv;!eS&O za!7d$k@D!!^61j?Bspp|RisPD2OJ2&|D(O)lvd8Ob*iv8BCTzo`|J}VMH-*OM7tw$ zaDYe%Mu+AFr&e*;=RW&{J@hmKi*hrOunt!1(AbqskSrA5F8d4%z=xa zWe5&DrJlgaC(GcIW$?)|B%T;0LFAA!BniuqG!_@jpr2((0z8Qt9mXg7i%)hJpX@F1 zqmRUIpX@67IT8zaO8e6x{#}mJfj!K79yYq$FP7SG>Njc#%lTK^{D*jEZIe zL@NNI5vZP8JxW?pWjxpdBvK!&Kl4zj7B!VT0x?cq;6#2SC+NUvhG>dJj;4#S?et8oMnx(tgp-3PQm^@KHIL$7_i8H zL(6;=tA5#{;;WAvA3zlpWp6y11w~U?(vt->$wn8c#>@`v83vsxHy|=6SUb4kC{&O~ z)=uGOfE%0Dsaz|1+|X)H!G_MrTC)L5|D!I$0TImWHmIHp0cds~LDjE_SdC$<5doyk zbg@Q$1Bh;@AYY58_NxBADVWGv3mThi&3|x9^yi?;)VSGxh;@>^`TYokajJ$~TwylC zcho`p|3vz!EU1K9Zyi;t8`;D^Ni@8*Lf8%nZywEp)}szu;Bf|zk{huaZ4XMY$dM;z z=A3#VEl05U*y>E-LhyZ~5DuwJ4IP@TAiax(=!>-inMvF*t5uyu4@^pTOgU?-s--M! zBT{UBOmLJyDks7b_JvspQ3gq^bnqF>$%xQf9uATII|)7sfZof4GQ8^2XTb=&vk^4c zV%P-DHkk>PZGc5(CJ&-;a%r1AP3Ytuz&~&pSKyL*2T}+a{=v%BX?dM#c_IDz%NB zta;Njqf==7GU$90-!@P6q~$${Bv5=qtefv9e3kUwgaiUI)Q&esr6cPhN1%K) z4}&k%6~2n;BCHq>kY`w}WnMe60XUJC?>o}+rquTL5{G;D7eTE%@dipZQh0elImRnE z@0%_H^L>)$B*6!FDh)NNVdx*y6q%Dmw|L>y7viltS+C_v+3w(6I@BK3Bl%6ej4!cP~P;`!CTBVjC%#as|kLY;57uVCD=;vN`h+%eu>~Zf^7uX z6TFIG7r}0V?F9dw;B^E$2yP%4BDj&DLa>uyn7|{b61;)njRfNa69k4JAowW39}xT@ z!T%r_A*d1TAs8j76O0k~1Py`@5`2i@!vr58coV^|5xkk;*9mSVcniT>34Vj%HiEYi z93=Qnf;$P`M{pOx`w2ck@cRUJ6C5G<3xdBS_zb~c5qy#0O9cF|n!h9X7{SL0{)pg@ z3GN~Ip9G&E_!EMA2|h{iDT4n+@TUa-o8UfzKO^{af*FGQ2|i8mS%SYN_#DCC5PY8C zZwVeC_yWO~3I3koD+FI92nqgy;30y4BzTzMYXn~>_y)l@3BE<}ZG!I*e3#p%DL zl;AOfeZdEnV4%m)6cM< z(90!^YN8+DkbfnCEQAIU`^;GT7%yBk)XuaX8XDebYIR&3Kh$gtiT)&oCT)yu@@IF0 z*BA^L4G+(om11r_w2ErZo|R)Z&leXvbQau&?~H48J5#lBdAr27acsjiv+{T4iH7IQ zO4p4&+oeUN71^9o(?YTs%Z$SQ;Sn=by=k}#a~_^{8cidGi&Ayv@zBYt7xqidBHu!Y z2jsF;Fsz4%aZEBiB~6h{X2htHXy?C$qji=rh-;m(~xMNAeS1Wr`S zXC4(l@c=+nBCRkXRuujF845c|?L$JOPHpRas?gz0(RKbByjGq)pNJ>4mtgoZa|rrP zJdcBC4i|m#2}4HzZ+l+?CFOOUSzYy4b#((xv+qzW0yQ9@n?(po2n0fV3j;!ONupKV zub_dZy7{YHxZ7Bk1VXZ`II^6KZQ1d(khKvjaV%NhWQ`|CIXRiwM?NQWPIt}ZoH(9w zoMgt6apEjW;^h18eg9TVQ__s*jOPqZ-FM&ezyJOB-S^&o+r2y+fOAt#oC9D+5XWe~ z2`7_q>j~$jl28<~jksx&^HWsZY5hqa76uS3@06^{xqg1A8?+;sTl{A znWqzn*5B8@mPQh46avAZ173s@ptbhGwYi3i zBGwot_-6m1pePXC>ZU|pg{ZI};Z<0L^@wm?vz86D6}=BL5|u_1!a2jjhh>A}fbNV& zWYtO|3c=PD6hCCP>GstDYzk$1^L-eXGAHtbz5TdhQN$&S2l|f(tcf!PC%o8xH5pRN zghE;W0(JUYxr26F)63c5MtIL)-|*6ApJ(Kv8r1VQ>=b2(rG^{I*D-#B9rGx`c>85y9Yb*3&rq)=h7}b4KYMgq4N%@>PwDz^=A)#tgCo!q^ITw`aShr_@rmX%8T?} zdUKL}n)YR0pC&yu`ZMg`OFB{M4F6=Rc7}@`d~soZU0fJn7k9dKXIOWpb!VB`5$jCs&(nlSa+#eCjRBtU4gr!j9+QpRpxT>w^?_!StI_n*1ZBM zOPPMXbvKxH@po8vquC_>&DPywt`z@P>t1EHiT`TrUSqBm|90zMXLg8xr**G4yTrfS 
zx;L0T;=j?lH<_Eoe~Wc*HMfcXcI$SUJH)@&x*st6#J}IV2h5$~KWN>%%-!Pupmnq6 z9`WC6-TTZT@poDGev=b_H(h&LuP3)0?BTbw15ou|#BCdBgBZ-Cn~d~hN1<(eYO8A z#u7;8w1?E6!Oeoa6cfGRU~f-9deeN}@V7U%n)RNudW8Sh?=e_ca9U1}K7fUjx5 zXQYqTps#DtXJv(3gTA3b&xjpf4f>`AeNI;IHRxL!^m+G)U&J>w=r?3}UsHWsgT5eZ z=Nj~;2L0Po!+-|-I}I2uHMkP+{f9K*S@%OApa$tTzH?sAD{9aqn(8?j&NS$Z27R&A za99J*YQXcQ_;IDl_aD`e7v4<_9x7_0a~kxby9@HC20f-hFO}j&``qIi@-i-uU_Vw< zeN4kHyc=J&N<*H|kl&QiRf9gRL0^(tvIae=L9e)LG0bYvuV~P3$vCV*pU|M+mO)yB ze)Z)B8I50#2eZ7gSyyRBm3ekQ2ivolio{xH>VO#@HscsVp?MbYpDk2vw+f*gIR1~0 z!W8g#9fgg+|G`n12LAVs!Y1H<=O}Cj{A-*DD4YTOzdH(N0{?H0!dbxo(or}Y_+K~*=K%k6MUchSPhB)nWJz%@E3h&|BIt=5%3>63Ks+aQ%B(v;6HE_W`O^Rqj34i zvaIKDt#w=+l7dy09lo*c=7BQ|LJ#rLh^k@<8#Q)Y|yOxPQZ}g@d zuvR3AGQu$8mvIc&xzOF50?GX7gTy{Su#eytf?El0Be6d%iESacl0eS3uOqgefcNTj$Dy5A2f;=H3I*;aVpN0WUL_dw+eS>* zbgw1Gk=fl&>^OQ2_c~%S0o*}sC&Bduy9jm@aKLqMAodYHv4_|w!)_#Y6T!^{`w0#Z zoM-Hv#JCFV9we3}xQAdV!vrh?#<8hl#qtG?O>ZLhmwIfPQKeU}_|#6^Q&>TYTt={x zU=_h4l4=RTVuB38QiA0K%LwF{`a)t02v!gT#Puoc-hlb+9Im;}N5;^GItLo%Z@@}x z6I8BaT{HdB*qZcJ=_@5&50XUwfn8!d@GDZ^O?@ZWO%qP3H=Aaz-<4YS4?x#PvDePN zbNkRLqysyE`DL{V zn7ia@a{m)Ma^8G0*LMU%#b7VS2$_S~WAP+JM>xY^sVhv?j!U1458l~b&pX2zR)1%3 zKjxA(`3qM0ZduqLlR-ex?H8l{wWv+0%2D*8X=t70s$bt|o5Jh(Fv%(ol|a%b5U0Kv z4&~_th)NUy;t*zd&q{X)FQVx4Ce{}c>+>-7*viA=iI+TPYk4g2rrEd zUwpOge)l21|6zhh07N^)QD`}3W}XKs3#?{WXz6?f`MYPCv=MoT{#gV(>IH}nS_GU! zKq++;i%}>JTq=r1HGnn6FRm(fc($AGa?f}1^Tj8uqyA@ z1cGV;pk4wgk0uaQ697d5R5Qv43bJ_O2tC15q$lGf2^2F>RSG`@id@wZeYh;)#g~Or z82VV)XAABt{0{4VMRlK`IEeUdFG30`Nl*+#eA$bT0)h?jVnV2c7)6w)`n4M+rGi#M z78iJbK@__J?FRe)CKG>&;I{xIl~uzkb@m!#zDn>q!Pg1CM({0wAuc(SuI{%zABpI` z>G?=8oNw&Tx!>S3-z0d0paW0J)8LGP`yB@TdjKz}sK~QW6UE zq8Wz`hLop%VKj*i$Ogpls(Cb~(JtgewX0mm#xg!_GGYdah5kq$wNhdmv(A*X^;ch> z$-eu^_!Dix;M3&(x({n5)T#qoqKI0DIfXawCHcc0D4x82DvmbwT+ zk~sRf4=Qm}qUOS>Yz26xj&>!rMP8bqwnQavq%ZKx-5{p-}NNy6|Tiwn(>y;N8SP}sAdJ~ zB<125%B6E-zigG&{#a^yrB~B7Jms#GR-@r;TXp1$J|lH<1IqX`$~XySJQpfs%H4u8 zu97mIuO~O8nP%cF7u1x1zV2%%(MtHJD26_As4aJ)V&o9}H>s0vqC{KRj;fC%2*2F4 zVPz7QH@R_(TBfSqHI>I4B62v@*F+IUt}+ERVLl_Fd*je9vgULM@@&7nn%Iu4fr# z4KrQ?yfmlFvekZ0Nb|05iE#&GmsG@}%FA_JxSNnvwwG$8k0`q+e;qRnEg;}zXVnzJ zbic++g4uqhM&K)1Ksj#cTXQU>2VA}S9$F4t#&ldUn?40n=W5ifD7(OkLRAI~MZ6BI zG{&G-!tQ<5>q}uXNY4$@&f{!6uRbdK^xelj!GD zD!!ObB^i7ce|%TeuH#}`mS#E0DCzhP{JeC0mxgB~3`#UA_X3k6GotlowQ5YNx^Vev zGVEsvK1=Wn!RH7*PatcSOIa5Ikg7R^*za=Lw-u3Cq~r;pgfj`hdC)=3*d9bg{;{p> z!0-L&Aa2=zDsMu|lGL#oEK6$Kds>#XkG$t)N#B0s|F;ME|DN_?%Vhp2i4j!BRUWJH z7hN+wxdK&_9EL6slff_Vh9Br1930rTetogff2upz*S8L;Tk(IWxUSzlvfj_6xW1?y zSZ#3-%}N&shcABd;`0|@_w7O+_YFg~U8Z&+U0f8sonu=k!5sixYcpJ)$srC)FfjK> zr=O0CTU0i|AQR*ZG&h47+|U)sRPJ8p!eCucAu8)9Gz`RaSRjM(DAm2m6L)G#@=g}u zAi@0%lM&IE1aOnnbpt~(RDKm&h2V}AjwId)#~mh+8ws>2>3*2tG{FIYwqz-Jc&M*$ z@Mq{>Vtp51hh+2+BgK^D!@we&?p#4(!d>_k`RNe-WZ@#bXyF@>JdBBW)PQ8%%~Grs zO_w~Y{UCfe^KFG=8hKQ_ej|@(#mHkLgyE3EMp$~OkuOeVzkc!wh|idG-?uwdzBj}? 
zWy<{#pTs1>H*WcnhQdt5K@XtZ@bpU9AQSlb$u~rxZLA{D+WzpDR-n~w&qq+xouV-g z0^CFJ1e^hJ@XKHZ_jdr=nnHFTA$IA5 zB+ueTbv6YYi+_SQ0>=WN;A!AE#VdFdaA@HYycu|lnJV}s;5f-E_+&F3Q@a_ZDSP|- z0=Z)3lfF=);dWl-;(*49fj+1-fJG48Rltg}$@CX81H(8gE?bZk*yUMF`(TNMd1m@@ zg(E|`BRF09Kn{2Nu}6~04Q|VDgeYQ&ID##_q3(4!wz>Ym&>(h5da*WMSU&)5TRH)d z{31OXDaGF|QtJ?a-=j5{n2AZxq)W1K+%P8=KkZ@Y-E5fSV=#bg!(ctfVvQEVeGYdW zUI7_?HJs99y)o3q>ruthH0v|iODYUec~xna4f1X#<=GB`l)Ri&)SwxD$|`{aeM2tv zRB`%@D@!)z5m-QGD04*9_U}iPbD`fj=7af)Gg0msE+2_5Itb48FFrCoQtNvE2jheP zkGM}~jy!`f0gU_n4Y<#L`RN#z8idx+aGVl9LP-1l;dFbE&avq@deDy$O?Mh;+ftqG zmyow8%G@3&Mt#nzBk>Hp+qq>Fob!C^vof#M;~HGUmji`wKO)XskK$Y*XMb=u^RcoH z<4rWY%SjWc!^5{NL`T5i$kHe!s*t;4a!ao$gwcNapd|@llXxj`ob?kHaa7Go(&Vojl{jPS18$Cl5eiymB7YWK}`3;1%la|x*Xbjw?_Zm2%BQ~<6DoulyQk$Z`hx|em zT}D9#R~Z_hWSewIMCe6kgqj10Aqi8RD=&eV4)Q|LsP}}0kz5R%46ZwzG4hMAEp475)H8%Jb5?xC+n2lty z_-xyxyrw11t2!$Ly#(9Dy-p|{7yzt=<#x*o=|~kf zSypJaRSX>xmKEfdqTkOk9jo3;H`!-}%KZKU>C!g8DM&{@LiD}DGA*5dEHualLUimekek2f)eCrC_tW+ooXg$Tg{a0G}DruZl-2unAR*VPiAMC z>Dk$4Ms|*wnVoB9W#^gM+4*Kpc7d6jU1;WI7n%9l#b!ZviCLJ2Tc_&FPbC`tcH-4@y zp|5<=br7f7Kkg*38Ag_);VUe*5 zUdTBw?zn_jnPs;>uGgDFq6R%F7r! zwF_%ww#;4F((arJQgPB7OSQl#Y)iw)qDTp#*&vcWLT?VuU~#2)HxSYry8w%Mqz*?e zxhRmLewWj6({Lui11=|5o`emoV;rgxZfhmUf!p_`;G=#?9&Lfv#ZXGtQLt*z$%3S9MB0>1 z>tJG(s-E^FCRI{rvUYV~kF1fjrE!WR@&&k<+MDk=UThb`B&9{=vDlqF+JnRUT^sxI z1)1{TdO%SYc?F^5UTQ{&$sayifRd@&IdmY0m+jiDR7VPp0A?M^>!aB#Q&)R0Yho@a z7n8Nw*$pZAbFi4A*vQ<-*PuT1bGq6qT8m#3q1JLRS`%2mjk{Y_E0XAD-6}PJT{6}N zYTyuBQxa*UWudN8LVwm$D2=5&CKfV^UyIU%{2bl24wAL%phH`V%qurD`4)gM#oUi0 zuiE75I7M>J^vMOYF8Fh3-UWX(N!3xCR6LDjJC>)Tq^e-rBvhtylR*}`arT=;A<3Q* zrsV>6pU<>Oo@qteBxmmxX}K)rF?i~D(#5%Oq?NVEa9VU`=#*Yg+ePv-B(p3j>g-Jx zfjNC`Yb-4-=W4Yl(naRwX39XjP`=bwo(_sSjtw2r9zOg`yS9V{8iv*p$19in#VktKL4FW~_<{DbWCadJUdur*23FuKJaEAjOUv6&MNYN3 zq}@++iCK*BG#EgoYG%Z?Rzs)F)N<+F3g46E^_565x|}g-^If{Xi%!&&StF zOP)G!(~?ilNBAU~FL~;Fep(0T(8D|xN^7Y6*lXTRKKH5ewNPo5``WOIEeb+Ku9nT3 zaz32T3JWvivT`O1GZvY#BNy}orTLYy@14UomDZNOv=(O%VB9XPuRFENt3HEOxpHPD zS9at+O}324LikbM*pnb_ewA&yA; z93ktXn#S@1I{bx^%9g2#ZsCzD+7i(282OGad%H8}K011Wy04q6J+G0fBCyCXNve6Y ztAlbqwJFDQr?!P}sM^B4-Z|Xsvo&wEy|bgcJklk@NM9WU=ne=_h5325w1--Y>LyP2 zcSxMcKFih3P~E>cvb2FL(rrj*0kSNw!56A(lzl=^NH{3jaYv*u8NU!cAz^0uZLyOU zsyIU6k>9UMndK1{g+ygXgz%HKN%Mcd#Nb|sMhc;zR$KIG-md*{U8A!+WTgVH%MNAa zRZjhR)h86lE2;|xLHFC~Re5{v9oBdqQbDcmHk4Iu7Pw@mL9u|tZ;eiats@vpC+IYM z0#_X|L$MzDFVSiEU)PcQVb!K_NwQ2d3SE*cm*`8~PV#N-E;RVwYF%h3_B~Uh`!C-o zH74jnw*({0>Uu^Nj42&|4b}HOqS@E68lGlp-Zk4khDy z?gF!QE=5wlzb5F5*&$K1uUPmC5dj0VPleK~NQ6HcRig`*%tIg3ky}N=1hPDI1+!DG zST>J5Q=0;}1r*p(n*yGo0$L@YpdF%1lqR6YO#wBeGvbopr6|%JqDaMxw^AsNRoObI z(!D-PmFYNTcqyt(&>0T~ROt;<#b2o2&9-G*?`}{72ZzfzgzJ6-{<@#HmY0zAptAH# zX{l$$K3&9XOH~T!A->s|lR->rxRou64XiZ3GKtS$o0zH^IUy;B7zR1B55JN$4VF@8 z3r!s?SqN<_1i{J-D%cLM*WH@W2Lyg9Okle_#bvX@==!*9c6~Xs>3TGX_8EVD0 z4Zw%a5;nyWc8QdYRMY5kDt3HONjw2q!N5VDwjyxuSlD?wsfp86ZyIHF*)4Usi*=bnj=C0#D-K$lxN6DT;~hq|sty-HJ%y28S@9e}n3orlbN;>ZS_M};I4CbcXW1s#Zxt153p zoC&k0ED+yCas+a%Fm#1DQ}I;AcT>dJg{M{z*^C$Niv9|>--EO}>X%Awt zo!}G(_R|mSK`$^79X3^^19wea**~e)+M^WNBosMBir}b}9dC7s2wmtzRM96RR3XzQ z@Dr-6MLlBoJJzL8hfcI=4xo0&j=0_3laNM%X@N}wn*~l1IQg{R*qFk--3D;lXp4_c zwRbYG2kgFq7+4b2+ZxBFVOcyjI^Fjtij1G3{=$6VGyOP6;)1kP9`h7Z%<@wtVfXGF zcsvd_gP7S~%;+2sJJyIe)a~e8KO}9RVho-hou@H{!@%eJshaS%V+e`n#G`wc&S{}7 z)9LW@!E>Vv{I}qniWk^Q@iw@l3%!`pMLxFJ!;Up0^*czl#0weCc-XN?i2GaQL@L1@ zU8*rqei1u1`Kfpjw8dNEE@`~XdunvKkFBs6lp>u@KmbjQobpdVWUxX9vucHmigtCF zRpYq{{)?z^FFOq3ADt)*M?1Sr_8PU-T*>xxBf(yR{RDRr+)L0!U8R}CBL^NT+kVC7lzTm2T`>(VPuG#FBTY{oX}T+y8>%di*38JC9k$mlke_CP zLI-K;Do}eJ1?sLzZU1E(D<`4$a>V20R_YiZFJK3H3DmLQh+JJ(}IIp 
zdCeC;4-yOi#nV&pHx<7b=|$c^}6eIcU7}~ z-eW)LvivvCz6dKoOOeCB1+^=4L5MbQy>s^U4X-}>=8FNJ{CyGCZ)PRH>g7i-uU>idO8x!%qJFV{q5fX|pnj=-xqfB!>R-P4=+#xR z`oY^5kBZel`tijNU#tGt%NG|#wcC|L^F~?k>g&2Kucl!vcl9uoi@IsoWz(+edR2DU zbzd&JaZ^_9@uoVKt9q=OU3u9}eRhAUd|^t?`_IBN_i>An~ruvxo_5+5n{ZmE=MY@I&=GS(+p(= z_bt?XyEvBXdMp{c11?)RTvlz{jRnG@5caHS?IW+pay9k#J#MCfJZv_5a9_67zApQE zUG*#94>qrM!QIS4vtrapb|{xYxRG#?Qrc9(5!@-0kkh6v>uR|vFQ+~@Sky~iCEL@X zYh?||4Gnyc-3|RR4fPE&a7vK=vYM1R<8Mozf_YW0t4{v5`?7{hDK08RPSsVp?qV3b zBdw)7lt+Y9(F-ye6)`W49jrc^`Jri-^3hFWseTnVn@Z7#idYEOhsiluH?W0DsAXff zXOdJ0o0o@OUBT3&x&DYo!0w_3t>7J=uB*Q3rlF`7YSyA=z>ILaF55ap zitOt>qp^8eE&C2tA7iKve?d2)kDw6W8{@I$Mbr+7Vgq%5(*!?MQPu@B9q+2$h%Tc0 zm<9Y$HZbYTYX_6BPtC{(8zc@xs1}n1lgMDztlDx{VR$#8$xKC#>$<6!A!HQ#y{QpK zfwanMT{SJ$w{?9e>v36XPD@p6H`=X2lg-flR>SC1A5zzr_Pm#;?xux$8O5`Xbp5{W zF{2x#h2bcz0WTmCNe&RI#t_GB;Fri9G7b`*bW%*L1kM}WVi>ffVHfT8*nRku7*{(M zdwq`7RE!!K^~}F5Xx0Ebj4-m@A%}&TIkkwft>3VN=o0#<&4f`bb!ubR$v0AfI)tYJ zBKz4RZJ3`PEx5rQOf62StoH||THZ9;a5wdCm>tl*8%FegRCzBOs-(Gv*t_sH@Ta?2 zOt=P`m4~Vijqi`C8fZ3hRV<~Fl@xXteh*V-=BQx*$LM)qVYZi(qqwPC%3&R%3!UeL zQQ+MUB$y-$TBS;b*=z4mW1X}dm918Uc}k$s1%@l2o6z{wGM$U2J~n&ohju~mzR0Gy zhn+G=Ez)xEMRzN#=Ayigr6r7u;t(VY zQg|fShwxT|Dtfbl9>#r{e4{$N>{n$&zQfJ6od&d5KzMBUd}CQ3P<02qhIANT*;g_W z8igY%STrwnqZF#9GFDxyEuEgD`Y(ULdXH?Uw(?Q)y8z7f0%aLH7s6`4Xb|n#Q=uDs zyu)ryqkwa!nscm{z6(k9CCW2&#}ZM*I@JyEgPLG|mJVYT05uAnZ%`LjfR3eHi-B%* zd+7Stv2YMx6Q@F3I1??Xf=vgY7Z~LQ9Mc(^g1XS;(7Es*rWzX(TDq$3S4_Y0s>Qw< z9__60B}w8KB*uaTWVJIHz}IGID{6~nwW^SCi%Ktd6JkH~7!Unp6ZDFM9qRRj?=sau zDlwg%6D;#{C5fL)A=sDKi0iuIk2l42MW@$@>AE^r<$5CKfd^M?xCBCbL<&qEKvH6w zL}5!*ht8L~&gBJ!$fmMlC1QD{;dDf*JBxwUz&d8sGC1pW`nw4yxnX>*Dlv4B1Ymd0 z#!xNbMo1-K0GTlZGa}S(hli1qe%%!dR)&~K1xiSUO;8`!lAlc2CxH}wsK7C({h>1+ zTM_;xQtIbaRW2J`A>s~%T*Xz9D3&p3Rd_GBc1cTIo$-AQd7t0G{+w!Yad`ad3XSV= zeCTt#Z1&jF07C|{@gxOS@YKg&Vy{mecHjR-YKk^`o zS)Va<{1{48tqGn%n}(;;QJC84e<%$nbnlCv_f)VISs-=*uvGx5ue!bn=b=9#OnDBa zi@|ZjG|B>*L#?B$4t@6qlfIf&e;#1yEgdALS}O7xM?q8k|0`#uS+{bI2z%EP3SMD) z@#u)B7UE@f#N?{BT4Bf(m5bEOvt6ZX;MEO+@frcDgeYAr|8lM2=kp+Z>+Rio&6YFa}!p3#{`W~Jf6!>*y?Pt(DuR_G{7x2m;} z0u*VaXVO=T#+vmKH@&KH1n90$=x^w46Qn|eXJ5IhSsWO6E)Bp#wPemrObmR+bcTq` zCc?s2jroQ(NB|EFJ`xzq)T+5;q1E}`r>uN6zmzpQ33s5|Vq=<2HdT>?;}whRl^(}+ zAQpjK3554c!ye1U%HUx3cOd1iS86Yli8>cNWT}>fv$jTE%)Se6@VxUPZ2`lNKU%32 zhPX9ul*h~*Fz9Dv<(Ebz!UskQmICLSRbfsZ0?mceGk(bIOyo==38*VpGKK*r0iA4S zvnB2`vMgxPPan%|6fs#dp2^Q1aW0(cIGs_FwIdTWT_SmLrl+9EMOkN8$$AD+6ZTBe zbd4GWolGT#OwVJ>Owh_?5R9bjLDC3`2en}ymlcLHE17w&_L;`BKHzjFM(WH)iqmq1 z$l+2%uav&gMEyW*QNMB6HMkfsxH4-r(5Q7Pr zb#o9=&=!GgX~;K+d%+611)zubp`To*>ac;1%kzG4|SIP$M!>Trm3q={7Jd z7LhQR7Qpf=_6Yto_P}UOHaE9e5$g{;E8%u5MadXRaU=xaHVkZiyu?){~ICKNDHivVpP0*^If(~d!A}f*YibRnmvW;5LV_v9HZklO& zzpKWqB7S+je7&Axo6hK|2h5c*5^mt#f&{FGB0{{{>}{2B_y?J1)p^CKIj0=@3Xhd& zMHh357g1yZ!8vxpB66sg3^cc#eKAzKYjSLS>dJ?y0;=R74>zGZ@6<(>02yaun`UV= z^dKL1;`W0x9iK=9AvpA)$DYDDi?SbGGr920f$xj$j&fOeo>J6j38eRiKL*S>$DS2Z ztSbhglbkCc9ijGZ3VIFtH#D}36P-AKai9MDW}Pfu&O3NTK^!ckIqsOp+{B;rt1vle z9436dz*hv@0+RMh3s~id75{aAcyvBQVqdG?8wR-DH`TMO!Y<*?fu)TS#>DUwwMAlf zo)*Wk1_<$IUQQ#0kY;>-mz&VJknU`+a3gU~-z?|8LN~RbnIscRZXFdlj*NEeP?abS zg6kxQJEtr;h-r=7Oa~90Z-WBsedJj7?aawcly74?&$N2hU1~Sy10ALnQ;35cYIjls zWYRAK=4K34jstwP$Low8bK!=a0!|32*>n7AMlkUw@|-XO!!vf!SrTK(BphVo{1QEh z5+;Af3eq|mam~s5TG{SQ(N)(P8FZ0?b#+T%O>|3OvgH5YJ*BOE^IwkV}`HmTh6uk0Ol#1=Unvp?3`;}U#=7LCfAi>#+;3pZ`9wI{lRiQhxstxQpc_+T?=6j+5Qy zU|Z0r^Z#nkWoN z42GSUw)d-gyd8@z-)Kt#t+!n(i=vUBW$9;wJ*)=ssEpDHi3v;&v=Le@idsLj3hW)P zVJNfY`!OL!84EZ&rxm`R7Xn=(4e-%C@nwb6dtBrWf+EO^gt9mAZ4nkgH1MP0#7?>o zU|z9Y^I7W}tGj0TxnWQRU+S1^KV8|D8j1XL$+-D&>4U%mMggY^*md1t 
z96Wpsi%$l8B){q6t3u~DnA@qirMFwY&Jn2h#q?+2M_Hk6IOY0`=5Ww}((g;^MeKC? z+{2Z!J$=NPC{ZRB;JQROXrM#Ps()2gc@d{D8pUPD85es&qMPt?&^!HtMiYwXnYsBj+YGCPkpgiClGtSY0wm=0Jzvm4Tn;}XpBGU}v~3(L#XZ$jRf38NCoUf@|BsWFcy z)N`+i_*f65ffy*y>TWtI^fE#=Y@$CU$C2NsJrG8jvNkqNayG3mv9KsKZ?W->YGo>4 zq)N?d6H1Uuqn&9HX9|&sb_}72it|`(8&eZRqDqV?vcZ>Q!hc}G!0eYCVYP9L+-0Y?vn7pKQ876I`v)!K)cH6Y6an1vDs=9BfvoMRUT zMY%zn@k;;;2v6#f&Q^$u>s3BxC~n5FAc)zr(m;b4=JYXL0WMEpn|RFXRmT;ufY<`( z;rL~WfIAi(ZcE`#_FV0Ple*n{OfEFPjiqGQtj*xmsLgP>Bg3gTMM#^6t5Y0K8jEoM z3Nxtp&=zuDlE~>c#r!8S??pF1F!Q(oiGUpVM76}$yJ9e_y10~A0k+9ffiN!7?Eo>< z%l`BjVn^Q}Im@8$K+!j1xI8_k{pm40Q-60Aho9LJNzo5*$LmN)ggm_zOqN%CK!|QF z(`$HxF%jjR*T$G{0c1p0=ZaUkK7FiO2ZH&)QQ9+mYk1N$J}ILwM4HX!PeE+=gf0eY))|?gLs4Y70(q9teExjFcb|TkfRd5fRM+>Q_eU` z0L2@e0s?Ujp`Oo?`2*Ppl~0d}89-lOA&WD%gDkOPyc*fX66&pdFnX{11SX8CJ-TS& z&z(t}fp$g9DJ{}(=^67&bS^+mEXWEScNN(olr2ps<3&f-s)_(jyFEQtu@Z$6MG^*h zfdD}9HWaKN=bNur*UXPD^0~Bi?_c~*tr(o!KfI@F3>QTS!U2r43*``!-V@yuj#8zs z$n#m9`RHTLhl&F@hCEbE9W#wea*vePxJ2h#1UZo44If35NYR*Lpoz@G{d~*?Dffng zO!}6I6;Obw@C1#kDFjYdd|xD8i{=uv{F(y!h7!8ZSZz! zHHwUE6Fnx2-@?bJhj1);?}K@3 z)~AoXd9RuBgp=a#U*W<>JPt(0i4?J^kSuW;T?L$1nn4tzRK*$6eH=+- zX#fc&gG6H|AS@c%IrK%1`@awC2%nAV=!Z3&&YvA4xl%ckJnZ|x?&?>&|gXux8)35?%m$uFC&u>qqzFZ?c*uIXH`=eC9qy|KE-T0XDcEYh)d z7p&5;QnN6^!j9;D*Q{6v9j?nT>lEGcupCydzPT_OQskP54`BK5oa=r)G{b8|qc6(^ zjFpNnbzHReGP|vT5O@9IhYiPAT3mlV3845%gE#`?cIg^(E4wY094ZPM)Kwp11jPbz2H3QN-ffxcubee}sh8~bW%GU(^SPaf$L3a`E zVJVX#?m`X2j#Pgx18evsoPH6y2=c}x7r*)8P2DR9knGehC1C?>8`>3OjtvAsA3?hD zyE@Ips>Ecd?UB;&hSs6q0HSIk!&QrYG74@BG&yDfRZjLuaAXh+iDW2sc}A(tb6r)L zL8#K6nTlyFRCP=^l2{Z4Fj#O0md=S@(LD}i2fv^@%Yu~PO=U>4w7i%v1JUvQu4No1tBmDw6Q-QWHA;~jo zM(0^Ly0Qv|i2Hhsl{>?986V_B6$sANGgwI7^Q$|K4)sMY5QGQv z)8`z*Vzp{5ql@p|(XS;$BB4nNTf_MU-ys-DX!8yMaovR%p;m~TiLfQCM}5g6pf=Tw z(wIz^PmQN3PoD*A5P-ED59Abw3eAPakP%%jA+TiGVWwyGj8~zDDLeB{u-zsxE-}Yt z(O^4kqR37Hb?7~3TBGDUQZ{x9m}&fM=w^zMGBrI0GGec)_iR0qzz<3yG@LA7tmgXO z?NAB%q<}*rk$s@uD2YUMHZ$qqQPHD0DQB7VN>LEjI<(at%rPpOsb|#CAm-T*i;K}P zLRzK1N-|6-JHocYwx!A-a1D4jR7EdA;EN_BGp9M)u!^H8e9=r=oVCq52ba{HmB68e z$4#vY!a*V&9Y0&u-J~@j_Y{A;R~P1G8>%0eti1O-rA=X>nU!?OXZar+utc+yoLF0Z-mEZ!K)mUwZhX4=x3Q z;A5%Xo`>t*t7Z4(=jHO2t6MBEVmMpg*82$viCg(W0qZTgOiw(#W#3F<3`~-%HMe!i z!Sa>`4*1)5M8gvB5m>kweT^9YK_aUIoHwu?K9Osv0*jW6c7_4yRhpVqbQ(U-Sn1HCv%+=8|oC<`F~ zqElcKV7ZAme8A3k@XYz+7Jh|u*==SdxIHsQlJiMG)Ktb-Xas}x6McNHhcVJ^1p|SQ zU?!e?nR|ZdAzzMSTpx+;0ex*Fs`xD-L%kv_mxuIqutuGzlnBrTTywBawpc*J>qPw ziHWeW^ANLJo5!2-(6B?O0ADaE`wk%jb`3vv2HU=K9)xq@lW#3OeZ?8;T2?FKwf>w2X>}p3!d3R+)DEA^$JBE!Q&Gjdn_nSg(U=e@ofn)pVoG^DS3DN!-Aw zHx{_b$PZ}*4c8OAis_?s0Pl6C{K%VO1DpDW*4)pG6rCP$Zlt}rNkS@tC1l>F$b_42 z+W!H+fp@w+Cu~mWsC6v+4#g;XeqwFI{`8dz8sflwSR0UA-ZsJCv3usn zyy20!0qDZ9@O7s{91dK0S_C?M1rBEvM~t2@VjcjNU;xtxVitj_1$Qdg8u7;5-_T3lYcgwrasPJj|^u2ub`0m&DHJ-kI z_vzgiPd_Lj^1;)0qk@EJcdmUd`+z>~Xd84&`?~wePCt11GjnfQ{w}GzPwes|X_R;0 zBu^#%Jhu38=93Y-d{4&Sww`SmhcwnR|!-CH(Zgvbg(FHt&ZQnWsOS>D+xo zga1qeX})(qD4W7-7Agqj5i9PuvPib=knWMS{V@kQ zT<9}hA-(rgKJO))QKSIp$#fV@j9rtQHG<@?mCvtb&<;Z$GW4F3dn<<)gFQ=+jvWls z@-DOWPUe`2zX@h@puvB#`!^+1N#&o#>$@)*L1ST7H1LA;NHMg*|#B2iH+~HTsl50B*auQ-=Bx} z>!cMmF`GL26Uyh9FpH=cBckOj`QMzC@Lp^o0O`*| z#qc3Z?@WyHO3k1G9nMw4ML05zAH)2T1_%{Cqc>WHHww)UUKK1VrAx7=|HfSZttu9l z!BC&Vxwf7{S2$2auIkCJLYb-aR%opfiV2w$>RL=N3JDn-$SV#@1j~0ZmtXkqOX&WF zVtmZ|_NASsGR%Eg^w!-kWllfvg>@YjZ9ju&G$yT6;>jUq6azzhWW?&~{>-C{A^VxX zx9u$W(SXoex|k)346H~$AK}pp+CwU9e0ukZq6qu2551|3!NbQPsSiUnUe2JuTp;?e|#;)kz(_Yz-#nql+Z%XP~k z2#J?CK<2B=cQ5mk-*`jZ*pCxG+ZbvJ2~VPW!xGQQ+IKG<_@r(#&M_9va`^)P+SmC% z(_9k2_!FMtp922;V!YVC)c^iS>JKk|%s&anKk4=2kG{M3;S1lr`XB!IM}PX0KmOBy 
z_hT}zc=SGTQvLZw{~8Vd@6|)TM>N$#?se1a!+*;U1RnmUo``EX4iAZgo8gZ*R$Xo% z@~?<+)41klp+A;ck>#TVyRnBOpXE7Z+CCgOQm!8I%M^Y>-xKf&^P;y}uLOF+ znv4Dw!K;EF2y7W6dk*|#%D#K$U-1EA+SUKsro+DgJosl1{^f(>!S6lz=MR4W!9RKM S2M>Pm;tPN8|NrySYyS^Q92n{V literal 0 HcmV?d00001 diff --git a/summa/preprocessing/__pycache__/textcleaner.cpython-38.pyc b/summa/preprocessing/__pycache__/textcleaner.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ec345bd17aa32d1e7d04e2ec50bc2758c610ed8 GIT binary patch literal 6821 zcmbVQTXWmS6~=-91yWbbvSr1I1K-71rhH49iy~XrrE$ilT~SIJNGc0)A(|El&<+*QpX}_}|DUx#9X(?f`yVyN@_FTR_XV-^^ zvkHEHi@q%HPAbYjsL}m1(71;u_&ZY+u5jimR`O;RlWo;f9rYl^)k9`!jdSQrxq2;a zrJ3Sr2LoI?WK7|@Wqhr87c|Am>}JI_Q|M2APs{HEmhp|^sXtF>XRRzZc*Y&94Ov5& zZSbr+TpO`QP*?b%J6apF#!w%!PB__vaXyT-Ps-W|Y7??HirOhz8$<21tetR9Oe%bQ zQbEt8>^a%hbA~f}3eP)yf}i?UvCi_-d=l?>`58Wi_j~*ueirX@{9XPY-tY5s{C&Jl zZu0YZpXV3&MZ7Oa%9l`Hbd+b84%unN$v(S$s9IO}Wqt)CuF4T8W$!ef<~j7{_y_zN z-XHK8ejV>?JkMwGp5Zt6O}wx3Tl_ZOc{%$|YxXR^$3NsBeaoyHe1ZRje~j8qexHBJ zKSS*nU*vQAbJTA0B|gvZqIQR~qLN$wp7dO3oDKGBH9KDyPF?tACkU!uC0}+O+jB&Y z#i`J)H0H0??J#tNch#)h#fvrz%cZ3PQjB(ic1^ZSR*{ z+pFw>SBxJn?t7tK4y)yLuNwZq?lrDVZx-{{?r!AI-`&bh-~D_uxE5W?MM*DpY-G34 z(U_XvSh#L&ZP?cvTfe$FbE~jfz_eItAW|!>Ho_!+xXHGc`W^K=|2KjAAly_&tr_M`Q z%2R(YfQ@l9VJS`!&W6sbP}tt?tTf2k9)px0TSl3$@5d8?RJPQ8N6%MDKqM(e&9XC0 zjG|Unsonae@TT2LNq&q z2oW#h2_{fHg#xyQnk-^Mi`XC4=UOq5lN8n)HCFRCV)jzDl41n2;>;ZORrYK3`P)Q^ z_L5dq@Z6Y5HvAUr&d<4i*>;2Zd}ne_6S^d~Bm9QrVQQ=gb+;PEnc~{xCtp8aT`J~M zVu;2}*qm3xs_)tEk@v*9bfSLtrhc3WVwcVeMKzd)KQV=RkMP?vlEOcYuZi%P60%5% z*lU%ry{Nolg`6r?CrO}AYzmxBT>=zRS|_UTazhtv>z{y1OpJSlh18Kf-QR=PKo)>g!yE7xF#;00~mF(?p;k>BYq|qQAmYCRD zGr%~Fq!njcNbm4|K1TbN`>f536zD~?dZa?oM3yE6I)(}fG-RO~rYh>|6l|WPoYUgd zZT?8Cy-4=ETe907-pDe5{tUX?hGYj`i#pS{SRI{iqEUrE-eDmlH6X!vqk5BPF z`tIQgqft7 zYL`9GZiUKEmD`1t+A#a=?6-binK$usXd>F02hSY>gG*6 zw_sAKU~d~(O|pnH9r;;l;Z&Sgfi>7=Y%w~6JvX#;`LUm8+V2;50vY(RAvmkSMwpaC zj|kfiTc*M1P=v|>T=fthvNb}`Mi}`O!no&Ia@<7Ksd#X8ne@1}Tv%L=jh(6ocyUCm zSA@S;ha<=>=rubmKkaAD#3D~{ekeQU-if>2cJrTU&%4)>(ne2t-PyP`v)k>0p*{gbW# zVfma|h-vvOmM^U1zX9y)Aea{9*ZoXq<|>ZoysC@& zZK6hIt^uwL0G4oS{tM^mEa^h5_8eG2hKQb`;yM)+LL`Mm_OX56`+C=ykL?L$NVo$?!H2kJr;WeDFUEAjN{Bz9vCmyA7F4;s$hCba2N(LTVn)YxKj|5UW z_u+ZN#aX4cxSV z`bq2%Sf_i0UJ8BuW08dV1DYcq!kwMZSYpU|Q9;pPnxpd=B*2HgBB7kX^%iuV{R1=YC%|xtkR;n)?&m_!4nkk1{ zslv*V_@ti~(k$^pLJCY2fU7e{ZpoG1R2OKGXX7T~k%#Cp9^CV)WuH47Nl0vz?a-8P69rbTsLmV^W`Fk3=w z=_;5p{d$roU@Y~{N4{He;*<;LCDV&_ zgiFq=I1LAPJf6!~BS@z!RZ7mFjxexB5}$T>(ndgUSAtlVA7fUPQWF&33GyEa-~G}$ zg+QZ%1G;@9l2H0Rh6i*gC=lBi!a^09oB^vGVrg|q6-w%PLDK6lC&Qw%}c0I zN))M-(6y{F1<9mfm0AdiL>i%Fjy0Z}YlK=*8_cbR3zVLiRWJ0-4%*F_679>BnRj zHs&Jthvhcf#+R7asuS)BkLdMW_?QHwc@tJeIia{n1%aR36)ci$?I_Q*#ydjqaAA#i z1mEfF7wn`=LuEor*H4{shoGJ$$EhqtYN(=wo-+Az`$rZ@1|Vg~QIRrCwPlFJMXfN{ z6(ksHk;ZXk^VD}FQynLyj(0jti^N^qH&}b4iXt{E)~KL>wfBH;9|1H*UaM5; z5Rlg@&tPMJI!e05G={VQKGw(^`jwNNBaY|) z$KyvSYU$zf+S>AJQGAIHg1|=FRy_E{La_vZ!?!}>jilwo7dz-Z z7BF^jk^*arawv(Gwy46ZRahdTweJYqkaN%>z9xQ&vXxkiGU`$$tz$7!wL`sq%#O7| zoP38i_E!u#$^ zk%FeF&kXM1YTGhiHB)MHY_@GwDWoL5xsS3w3g4RuQ>z=f8!z{1Ztcm3I zykp%GOeL0!%S3O0EUxN0;wD<+3KeusnaJfD)hNu8X+@H4(WRxl`Ralyy6q4of0?8U z6SX+)1ZBJKSc6@eYdo-Cc(C|*X<2+ogJm@NDb-|f*@-JJ5dl5` literal 0 HcmV?d00001 diff --git a/summa/preprocessing/__pycache__/util.cpython-38.pyc b/summa/preprocessing/__pycache__/util.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5979ffe72bc7e097d59bc1b4329cc139897049da GIT binary patch literal 701 zcmb7?y-ve05XYUQsG+D80|K#m?NaX*LP#(mh6;gLSdbe#ZD`^|XGd*OHsD!MpMf{Y 
z%3CmTPEyng7&ys}@5876{o=cBw?ok0i>n}X3Hh+jreTbn;nD}V2_l>qa!-USJp4MM zbxZm?frqKJ*>elmIWB#OTY+~BDV%3#kG#6aq;QW2-g2KVI~46kW-$+>re-Q>8VjoP zFpL&74eJO*lSuKHY6Fp)(D9OTs^q*Wt6Gd!PU*!`t-R7toBGnjUZsbXD*|nl@hMp% zMZ}rF60X<qw5q{%^T#YW)Z5MZA`#k}5pbjx0?b&X{oV>K2Z?E4$_ a-3m|TY}~bf4yp$JNs@{@mZ#rD{q`5g|FX6K literal 0 HcmV?d00001 diff --git a/summa/preprocessing/porter.py b/summa/preprocessing/porter.py new file mode 100644 index 0000000..9e06b59 --- /dev/null +++ b/summa/preprocessing/porter.py @@ -0,0 +1,635 @@ +# Adapted from the NLTK package v3.0.1: +# https://github.com/nltk/nltk/blob/3.0.1/nltk/stem/porter.py + +# Copyright (c) 2002 Vivake Gupta (vivakeATomniscia.org). All rights reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 +# USA +# +# This software is maintained by Vivake (vivakeATomniscia.org) and is available at: +# http://www.omniscia.org/~vivake/python/PorterStemmer.py +# +# Additional modifications were made to incorporate this module into +# NLTK. All such modifications are marked with "--NLTK--". The NLTK +# version of this module is maintained by NLTK developers, +# and is available via http://nltk.org/ +# +# GNU Linking Exception: +# Using this module statically or dynamically with other modules is +# making a combined work based on this module. Thus, the terms and +# conditions of the GNU General Public License cover the whole combination. +# As a special exception, the copyright holders of this module give +# you permission to combine this module with independent modules to +# produce an executable program, regardless of the license terms of these +# independent modules, and to copy and distribute the resulting +# program under terms of your choice, provided that you also meet, +# for each linked independent module, the terms and conditions of +# the license of that module. An independent module is a module which +# is not derived from or based on this module. If you modify this module, +# you may extend this exception to your version of the module, but you +# are not obliged to do so. If you do not wish to do so, delete this +# exception statement from your version. + +""" +Porter Stemmer + +This is the Porter stemming algorithm, ported to Python from the +version coded up in ANSI C by the author. It follows the algorithm +presented in + +Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. + +only differing from it at the points marked --DEPARTURE-- and --NEW-- +below. + +For a more faithful version of the Porter algorithm, see + + http://www.tartarus.org/~martin/PorterStemmer/ + +Later additions: + + June 2000 + + The 'l' of the 'logi' -> 'log' rule is put with the stem, so that + short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc. + + This follows a suggestion of Barry Wilkins, research student at + Birmingham. 
+ + + February 2000 + + the cvc test for not dropping final -e now looks after vc at the + beginning of a word, so are, eve, ice, ore, use keep final -e. In this + test c is any consonant, including w, x and y. This extension was + suggested by Chris Emerson. + + -fully -> -ful treated like -fulness -> -ful, and + -tionally -> -tion treated like -tional -> -tion + + both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi. + + Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh. + +Additional modifications were made to incorperate this module into +nltk. All such modifications are marked with \"--NLTK--\". The nltk +version of this module is maintained by the NLTK developers, and is +available from +""" + + +## --NLTK-- +## Declare this module's documentation format. + +class PorterStemmer(): + + ## --NLTK-- + ## Add a module docstring + """ + A word stemmer based on the Porter stemming algorithm. + + Porter, M. \"An algorithm for suffix stripping.\" + Program 14.3 (1980): 130-137. + + A few minor modifications have been made to Porter's basic + algorithm. See the source code of this module for more + information. + + The Porter Stemmer requires that all tokens have string types. + """ + + # The main part of the stemming algorithm starts here. + # Note that only lower case sequences are stemmed. Forcing to lower case + # should be done before stem(...) is called. + + def __init__(self): + + ## --NEW-- + ## This is a table of irregular forms. It is quite short, but still + ## reflects the errors actually drawn to Martin Porter's attention over + ## a 20 year period! + ## + ## Extend it as necessary. + ## + ## The form of the table is: + ## { + ## "p1" : ["s11","s12","s13", ... ], + ## "p2" : ["s21","s22","s23", ... ], + ## ... + ## "pn" : ["sn1","sn2","sn3", ... ] + ## } + ## + ## String sij is mapped to paradigm form pi, and the main stemming + ## process is then bypassed. + + irregular_forms = { + "sky" : ["sky", "skies"], + "die" : ["dying"], + "lie" : ["lying"], + "tie" : ["tying"], + "news" : ["news"], + "inning" : ["innings", "inning"], + "outing" : ["outings", "outing"], + "canning" : ["cannings", "canning"], + "howe" : ["howe"], + + # --NEW-- + "proceed" : ["proceed"], + "exceed" : ["exceed"], + "succeed" : ["succeed"], # Hiranmay Ghosh + } + + self.pool = {} + for key in irregular_forms: + for val in irregular_forms[key]: + self.pool[val] = key + + self.vowels = frozenset(['a', 'e', 'i', 'o', 'u']) + + def _cons(self, word, i): + """cons(i) is TRUE <=> b[i] is a consonant.""" + if word[i] in self.vowels: + return False + if word[i] == 'y': + if i == 0: + return True + else: + return (not self._cons(word, i - 1)) + return True + + def _m(self, word, j): + """m() measures the number of consonant sequences between k0 and j. + if c is a consonant sequence and v a vowel sequence, and <..> + indicates arbitrary presence, + + gives 0 + vc gives 1 + vcvc gives 2 + vcvcvc gives 3 + .... 
+ """ + n = 0 + i = 0 + while True: + if i > j: + return n + if not self._cons(word, i): + break + i = i + 1 + i = i + 1 + + while True: + while True: + if i > j: + return n + if self._cons(word, i): + break + i = i + 1 + i = i + 1 + n = n + 1 + + while True: + if i > j: + return n + if not self._cons(word, i): + break + i = i + 1 + i = i + 1 + + def _vowelinstem(self, stem): + """vowelinstem(stem) is TRUE <=> stem contains a vowel""" + for i in range(len(stem)): + if not self._cons(stem, i): + return True + return False + + def _doublec(self, word): + """doublec(word) is TRUE <=> word ends with a double consonant""" + if len(word) < 2: + return False + if (word[-1] != word[-2]): + return False + return self._cons(word, len(word)-1) + + def _cvc(self, word, i): + """cvc(i) is TRUE <=> + + a) ( --NEW--) i == 1, and word[0] word[1] is vowel consonant, or + + b) word[i - 2], word[i - 1], word[i] has the form consonant - + vowel - consonant and also if the second c is not w, x or y. this + is used when trying to restore an e at the end of a short word. + e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. + """ + if i == 0: return False # i == 0 never happens perhaps + if i == 1: return (not self._cons(word, 0) and self._cons(word, 1)) + if not self._cons(word, i) or self._cons(word, i-1) or not self._cons(word, i-2): return False + + ch = word[i] + if ch == 'w' or ch == 'x' or ch == 'y': + return False + + return True + + def _step1ab(self, word): + """step1ab() gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + sties -> sti + tie -> tie (--NEW--: see below) + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + """ + if word[-1] == 's': + if word.endswith("sses"): + word = word[:-2] + elif word.endswith("ies"): + if len(word) == 4: + word = word[:-1] + # this line extends the original algorithm, so that + # 'flies'->'fli' but 'dies'->'die' etc + else: + word = word[:-2] + elif word[-2] != 's': + word = word[:-1] + + ed_or_ing_trimmed = False + if word.endswith("ied"): + if len(word) == 4: + word = word[:-1] + else: + word = word[:-2] + # this line extends the original algorithm, so that + # 'spied'->'spi' but 'died'->'die' etc + + elif word.endswith("eed"): + if self._m(word, len(word)-4) > 0: + word = word[:-1] + + + elif word.endswith("ed") and self._vowelinstem(word[:-2]): + word = word[:-2] + ed_or_ing_trimmed = True + elif word.endswith("ing") and self._vowelinstem(word[:-3]): + word = word[:-3] + ed_or_ing_trimmed = True + + if ed_or_ing_trimmed: + if word.endswith("at") or word.endswith("bl") or word.endswith("iz"): + word += 'e' + elif self._doublec(word): + if word[-1] not in ['l', 's', 'z']: + word = word[:-1] + elif (self._m(word, len(word)-1) == 1 and self._cvc(word, len(word)-1)): + word += 'e' + + return word + + def _step1c(self, word): + """step1c() turns terminal y to i when there is another vowel in the stem. + --NEW--: This has been modified from the original Porter algorithm so that y->i + is only done when y is preceded by a consonant, but not if the stem + is only a single consonant, i.e. + + (*c and not c) Y -> I + + So 'happy' -> 'happi', but + 'enjoy' -> 'enjoy' etc + + This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'-> + 'enjoy'. Step 1c is perhaps done too soon; but with this modification that + no longer really matters. 
+ + Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly', + 'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried', + 'flies' ... + """ + if word[-1] == 'y' and len(word) > 2 and self._cons(word, len(word) - 2): + return word[:-1] + 'i' + else: + return word + + def _step2(self, word): + """step2() maps double suffices to single ones. + so -ization ( = -ize plus -ation) maps to -ize etc. note that the + string before the suffix must give m() > 0. + """ + if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed' + return word + + ch = word[-2] + + if ch == 'a': + if word.endswith("ational"): + return word[:-7] + "ate" if self._m(word, len(word)-8) > 0 else word + elif word.endswith("tional"): + return word[:-2] if self._m(word, len(word)-7) > 0 else word + else: + return word + elif ch == 'c': + if word.endswith("enci"): + return word[:-4] + "ence" if self._m(word, len(word)-5) > 0 else word + elif word.endswith("anci"): + return word[:-4] + "ance" if self._m(word, len(word)-5) > 0 else word + else: + return word + elif ch == 'e': + if word.endswith("izer"): + return word[:-1] if self._m(word, len(word)-5) > 0 else word + else: + return word + elif ch == 'l': + if word.endswith("bli"): + return word[:-3] + "ble" if self._m(word, len(word)-4) > 0 else word # --DEPARTURE-- + # To match the published algorithm, replace "bli" with "abli" and "ble" with "able" + elif word.endswith("alli"): + # --NEW-- + if self._m(word, len(word)-5) > 0: + word = word[:-2] + return self._step2(word) + else: + return word + elif word.endswith("fulli"): + return word[:-2] if self._m(word, len(word)-6) else word # --NEW-- + elif word.endswith("entli"): + return word[:-2] if self._m(word, len(word)-6) else word + elif word.endswith("eli"): + return word[:-2] if self._m(word, len(word)-4) else word + elif word.endswith("ousli"): + return word[:-2] if self._m(word, len(word)-6) else word + else: + return word + elif ch == 'o': + if word.endswith("ization"): + return word[:-7] + "ize" if self._m(word, len(word)-8) else word + elif word.endswith("ation"): + return word[:-5] + "ate" if self._m(word, len(word)-6) else word + elif word.endswith("ator"): + return word[:-4] + "ate" if self._m(word, len(word)-5) else word + else: + return word + elif ch == 's': + if word.endswith("alism"): + return word[:-3] if self._m(word, len(word)-6) else word + elif word.endswith("ness"): + if word.endswith("iveness"): + return word[:-4] if self._m(word, len(word)-8) else word + elif word.endswith("fulness"): + return word[:-4] if self._m(word, len(word)-8) else word + elif word.endswith("ousness"): + return word[:-4] if self._m(word, len(word)-8) else word + else: + return word + else: + return word + elif ch == 't': + if word.endswith("aliti"): + return word[:-3] if self._m(word, len(word)-6) else word + elif word.endswith("iviti"): + return word[:-5] + "ive" if self._m(word, len(word)-6) else word + elif word.endswith("biliti"): + return word[:-6] + "ble" if self._m(word, len(word)-7) else word + else: + return word + elif ch == 'g': # --DEPARTURE-- + if word.endswith("logi"): + return word[:-1] if self._m(word, len(word) - 4) else word # --NEW-- (Barry Wilkins) + # To match the published algorithm, pass len(word)-5 to _m instead of len(word)-4 + else: + return word + + else: + return word + + def _step3(self, word): + """step3() deals with -ic-, -full, -ness etc. 
similar strategy to step2.""" + + ch = word[-1] + + if ch == 'e': + if word.endswith("icate"): + return word[:-3] if self._m(word, len(word)-6) else word + elif word.endswith("ative"): + return word[:-5] if self._m(word, len(word)-6) else word + elif word.endswith("alize"): + return word[:-3] if self._m(word, len(word)-6) else word + else: + return word + elif ch == 'i': + if word.endswith("iciti"): + return word[:-3] if self._m(word, len(word)-6) else word + else: + return word + elif ch == 'l': + if word.endswith("ical"): + return word[:-2] if self._m(word, len(word)-5) else word + elif word.endswith("ful"): + return word[:-3] if self._m(word, len(word)-4) else word + else: + return word + elif ch == 's': + if word.endswith("ness"): + return word[:-4] if self._m(word, len(word)-5) else word + else: + return word + + else: + return word + + def _step4(self, word): + """step4() takes off -ant, -ence etc., in context vcvc.""" + + if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed' + return word + + ch = word[-2] + + if ch == 'a': + if word.endswith("al"): + return word[:-2] if self._m(word, len(word)-3) > 1 else word + else: + return word + elif ch == 'c': + if word.endswith("ance"): + return word[:-4] if self._m(word, len(word)-5) > 1 else word + elif word.endswith("ence"): + return word[:-4] if self._m(word, len(word)-5) > 1 else word + else: + return word + elif ch == 'e': + if word.endswith("er"): + return word[:-2] if self._m(word, len(word)-3) > 1 else word + else: + return word + elif ch == 'i': + if word.endswith("ic"): + return word[:-2] if self._m(word, len(word)-3) > 1 else word + else: + return word + elif ch == 'l': + if word.endswith("able"): + return word[:-4] if self._m(word, len(word)-5) > 1 else word + elif word.endswith("ible"): + return word[:-4] if self._m(word, len(word)-5) > 1 else word + else: + return word + elif ch == 'n': + if word.endswith("ant"): + return word[:-3] if self._m(word, len(word)-4) > 1 else word + elif word.endswith("ement"): + return word[:-5] if self._m(word, len(word)-6) > 1 else word + elif word.endswith("ment"): + return word[:-4] if self._m(word, len(word)-5) > 1 else word + elif word.endswith("ent"): + return word[:-3] if self._m(word, len(word)-4) > 1 else word + else: + return word + elif ch == 'o': + if word.endswith("sion") or word.endswith("tion"): # slightly different logic to all the other cases + return word[:-3] if self._m(word, len(word)-4) > 1 else word + elif word.endswith("ou"): + return word[:-2] if self._m(word, len(word)-3) > 1 else word + else: + return word + elif ch == 's': + if word.endswith("ism"): + return word[:-3] if self._m(word, len(word)-4) > 1 else word + else: + return word + elif ch == 't': + if word.endswith("ate"): + return word[:-3] if self._m(word, len(word)-4) > 1 else word + elif word.endswith("iti"): + return word[:-3] if self._m(word, len(word)-4) > 1 else word + else: + return word + elif ch == 'u': + if word.endswith("ous"): + return word[:-3] if self._m(word, len(word)-4) > 1 else word + else: + return word + elif ch == 'v': + if word.endswith("ive"): + return word[:-3] if self._m(word, len(word)-4) > 1 else word + else: + return word + elif ch == 'z': + if word.endswith("ize"): + return word[:-3] if self._m(word, len(word)-4) > 1 else word + else: + return word + else: + return word + + def _step5(self, word): + """step5() removes a final -e if m() > 1, and changes -ll to -l if + m() > 1. 
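        For a sense of how steps 2-4 compose before this final step, one
        hand-traced example (matching GENERALIZATION -> GENER in Porter, 1980):
        'generalization' -> 'generalize' (step 2, -ization) -> 'general'
        (step 3, -alize) -> 'gener' (step 4, -al); step 5 then leaves it alone.

        >>> PorterStemmer().stem("generalization")
        'gener'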
+ """ + if word[-1] == 'e': + a = self._m(word, len(word)-1) + if a > 1 or (a == 1 and not self._cvc(word, len(word)-2)): + word = word[:-1] + if word.endswith('ll') and self._m(word, len(word)-1) > 1: + word = word[:-1] + + return word + + def stem_word(self, p, i=0, j=None): + """ + Returns the stem of p, or, if i and j are given, the stem of p[i:j+1]. + """ + ## --NLTK-- + if j is None and i == 0: + word = p + else: + if j is None: + j = len(p) - 1 + word = p[i:j+1] + + if word in self.pool: + return self.pool[word] + + if len(word) <= 2: + return word # --DEPARTURE-- + # With this line, strings of length 1 or 2 don't go through the + # stemming process, although no mention is made of this in the + # published algorithm. Remove the line to match the published + # algorithm. + + word = self._step1ab(word) + word = self._step1c(word) + word = self._step2(word) + word = self._step3(word) + word = self._step4(word) + word = self._step5(word) + return word + + def _adjust_case(self, word, stem): + lower = word.lower() + + ret = "" + for x in range(len(stem)): + if lower[x] == stem[x]: + ret += word[x] + else: + ret += stem[x] + + return ret + + ## --NLTK-- + ## Don't use this procedure; we want to work with individual + ## tokens, instead. (commented out the following procedure) + #def stem(self, text): + # parts = re.split("(\W+)", text) + # numWords = (len(parts) + 1)/2 + # + # ret = "" + # for i in xrange(numWords): + # word = parts[2 * i] + # separator = "" + # if ((2 * i) + 1) < len(parts): + # separator = parts[(2 * i) + 1] + # + # stem = self.stem_word(string.lower(word), 0, len(word) - 1) + # ret = ret + self.adjust_case(word, stem) + # ret = ret + separator + # return ret + + ## --NLTK-- + ## Define a stem() method that implements the StemmerI interface. + def stem(self, word): + stem = self.stem_word(word.lower(), 0, len(word) - 1) + return self._adjust_case(word, stem) + + ## --NLTK-- + ## Add a string representation function + def __repr__(self): + return '' diff --git a/summa/preprocessing/snowball.py b/summa/preprocessing/snowball.py new file mode 100644 index 0000000..64a166f --- /dev/null +++ b/summa/preprocessing/snowball.py @@ -0,0 +1,4291 @@ +# Adapted from the NLTK package v3.0.1: +# https://github.com/nltk/nltk/blob/3.0.1/nltk/stem/snowball.py + +# +# Natural Language Toolkit: Snowball Stemmer +# +# Copyright (C) 2001-2014 NLTK Project +# Author: Peter Michael Stahl +# Peter Ljunglof (revisions) +# Algorithms: Dr Martin Porter +# URL: +# For license information, see LICENSE.TXT + +""" +Snowball stemmers + +This module provides a port of the Snowball stemmers +developed by Martin Porter. + +""" + +import re + +from .porter import PorterStemmer +from .util import prefix_replace, suffix_replace + + +class SnowballStemmer(): + + """ + Snowball Stemmer + + The following languages are supported: + Danish, Dutch, English, Finnish, French, German, + Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, + Spanish and Swedish. + + The algorithm for English is documented here: + + Porter, M. \"An algorithm for suffix stripping.\" + Program 14.3 (1980): 130-137. + + The algorithms have been developed by Martin Porter. + These stemmers are called Snowball, because Porter created + a programming language with this name for creating + new stemming algorithms. 
There is more information available + at http://snowball.tartarus.org/ + + The stemmer is invoked as shown below: + + >>> from summa.preprocessing.snowball import SnowballStemmer + >>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported + ... + >>> stemmer = SnowballStemmer("german") # Choose a language + >>> stemmer.stem("Autobahnen") # Stem a word + 'autobahn' + + Invoking the stemmers that way is useful if you do not know the + language to be stemmed at runtime. Alternatively, if you already know + the language, then you can invoke the language specific stemmer directly: + + >>> from summa.preprocessing.snowball import GermanStemmer + >>> stemmer = GermanStemmer() + >>> stemmer.stem("Autobahnen") + 'autobahn' + + :param language: The language whose subclass is instantiated. + :type language: str or unicode + :raise ValueError: If there is no stemmer for the specified + language, a ValueError is raised. + """ + + languages = ( + "arabic", + "danish", + "dutch", + "english", + "finnish", + "french", + "german", + "hungarian", + "italian", + "norwegian", + "polish", + "portuguese", + "romanian", + "russian", + "spanish", + "swedish", + ) + + def __init__(self, language): + if language not in self.languages: + raise ValueError("The language '%s' is not supported." % language) + stemmerclass = globals()[language.capitalize() + "Stemmer"] + self.stemmer = stemmerclass() + self.stem = self.stemmer.stem + + +class _LanguageSpecificStemmer(): + + """ + This helper subclass offers the possibility + to invoke a specific stemmer directly. + This is useful if you already know the language to be stemmed at runtime. + + Create an instance of the Snowball stemmer. + """ + + def __init__(self): + # The language is the name of the class, minus the final "Stemmer". + language = type(self).__name__.lower() + if language.endswith("stemmer"): + language = language[:-7] + + def __repr__(self): + """ + Print out the string representation of the respective class. + + """ + return "<%s>" % type(self).__name__ + + +class PorterStemmer(_LanguageSpecificStemmer, PorterStemmer): + """ + A word stemmer based on the original Porter stemming algorithm. + + Porter, M. \"An algorithm for suffix stripping.\" + Program 14.3 (1980): 130-137. + + A few minor modifications have been made to Porter's basic + algorithm. See the source code of the module + nltk.stem.porter for more information. + + """ + def __init__(self): + _LanguageSpecificStemmer.__init__(self) + PorterStemmer.__init__(self) + + +class _ScandinavianStemmer(_LanguageSpecificStemmer): + + """ + This subclass encapsulates a method for defining the string region R1. + It is used by the Danish, Norwegian, and Swedish stemmer. + + """ + + def _r1_scandinavian(self, word, vowels): + """ + Return the region R1 that is used by the Scandinavian stemmers. + + R1 is the region after the first non-vowel following a vowel, + or is the null region at the end of the word if there is no + such non-vowel. But then R1 is adjusted so that the region + before it contains at least three letters. + + :param word: The word whose region R1 is determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the region R1. + :type vowels: unicode + :return: the region R1 for the respective word. + :rtype: unicode + :note: This helper method is invoked by the respective stem method of + the subclasses DanishStemmer, NorwegianStemmer, and + SwedishStemmer. It is not to be invoked directly! 
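        A rough standalone sketch of the basic R1 definition (before the
        three-letter adjustment described above), using the examples from the
        Snowball documentation at http://snowball.tartarus.org/texts/r1r2.html;
        the helper name r1 here is purely illustrative:

        >>> def r1(word, vowels="aeiouy"):
        ...     for i in range(1, len(word)):
        ...         if word[i] not in vowels and word[i - 1] in vowels:
        ...             return word[i + 1:]
        ...     return ""
        >>> r1("beautiful"), r1("beauty"), r1("beau")
        ('iful', 'y', '')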
+ + """ + r1 = "" + for i in range(1, len(word)): + if word[i] not in vowels and word[i-1] in vowels: + if len(word[:i+1]) < 3 and len(word[:i+1]) > 0: + r1 = word[3:] + elif len(word[:i+1]) >= 3: + r1 = word[i+1:] + else: + return word + break + + return r1 + + + +class _StandardStemmer(_LanguageSpecificStemmer): + + """ + This subclass encapsulates two methods for defining the standard versions + of the string regions R1, R2, and RV. + + """ + + def _r1r2_standard(self, word, vowels): + """ + Return the standard interpretations of the string regions R1 and R2. + + R1 is the region after the first non-vowel following a vowel, + or is the null region at the end of the word if there is no + such non-vowel. + + R2 is the region after the first non-vowel following a vowel + in R1, or is the null region at the end of the word if there + is no such non-vowel. + + :param word: The word whose regions R1 and R2 are determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the regions R1 and R2. + :type vowels: unicode + :return: (r1,r2), the regions R1 and R2 for the respective word. + :rtype: tuple + :note: This helper method is invoked by the respective stem method of + the subclasses DutchStemmer, FinnishStemmer, + FrenchStemmer, GermanStemmer, ItalianStemmer, + PortugueseStemmer, RomanianStemmer, and SpanishStemmer. + It is not to be invoked directly! + :note: A detailed description of how to define R1 and R2 + can be found at http://snowball.tartarus.org/texts/r1r2.html + + """ + r1 = "" + r2 = "" + for i in range(1, len(word)): + if word[i] not in vowels and word[i-1] in vowels: + r1 = word[i+1:] + break + + for i in range(1, len(r1)): + if r1[i] not in vowels and r1[i-1] in vowels: + r2 = r1[i+1:] + break + + return (r1, r2) + + + + def _rv_standard(self, word, vowels): + """ + Return the standard interpretation of the string region RV. + + If the second letter is a consonant, RV is the region after the + next following vowel. If the first two letters are vowels, RV is + the region after the next following consonant. Otherwise, RV is + the region after the third letter. + + :param word: The word whose region RV is determined. + :type word: str or unicode + :param vowels: The vowels of the respective language that are + used to determine the region RV. + :type vowels: unicode + :return: the region RV for the respective word. + :rtype: unicode + :note: This helper method is invoked by the respective stem method of + the subclasses ItalianStemmer, PortugueseStemmer, + RomanianStemmer, and SpanishStemmer. It is not to be + invoked directly! + + """ + rv = "" + if len(word) >= 2: + if word[1] not in vowels: + for i in range(2, len(word)): + if word[i] in vowels: + rv = word[i+1:] + break + + elif word[:2] in vowels: + for i in range(2, len(word)): + if word[i] not in vowels: + rv = word[i+1:] + break + else: + rv = word[3:] + + return rv + + + +class DanishStemmer(_ScandinavianStemmer): + + """ + The Danish Snowball stemmer. + + :cvar __vowels: The Danish vowels. + :type __vowels: unicode + :cvar __consonants: The Danish consonants. + :type __consonants: unicode + :cvar __double_consonants: The Danish double consonants. + :type __double_consonants: tuple + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. 
+ :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Danish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/danish/stemmer.html + + """ + + # The language's vowels and other important characters are defined. + __vowels = "aeiouy\xE6\xE5\xF8" + __consonants = "bcdfghjklmnpqrstvwxz" + __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj", + "kk", "ll", "mm", "nn", "pp", "qq", "rr", + "ss", "tt", "vv", "ww", "xx", "zz") + __s_ending = "abcdfghjklmnoprtvyz\xE5" + + # The different suffixes, divided into the algorithm's steps + # and organized by length, are listed in tuples. + __step1_suffixes = ("erendes", "erende", "hedens", "ethed", + "erede", "heden", "heder", "endes", + "ernes", "erens", "erets", "ered", + "ende", "erne", "eren", "erer", "heds", + "enes", "eres", "eret", "hed", "ene", "ere", + "ens", "ers", "ets", "en", "er", "es", "et", + "e", "s") + __step2_suffixes = ("gd", "dt", "gt", "kt") + __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig") + + def stem(self, word): + """ + Stem a Danish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + # Every word is put into lower case for normalization. + word = word.lower() + + # After this, the required regions are generated + # by the respective helper method. + r1 = self._r1_scandinavian(word, self.__vowels) + + # Then the actual stemming process starts. + # Every new step is explicitly indicated + # according to the descriptions on the Snowball website. + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + if r1.endswith("igst"): + word = word[:-2] + r1 = r1[:-2] + + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix == "l\xF8st": + word = word[:-1] + r1 = r1[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + + if r1.endswith(self.__step2_suffixes): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 4: Undouble + for double_cons in self.__double_consonants: + if word.endswith(double_cons) and len(word) > 3: + word = word[:-1] + break + + + return word + + +class DutchStemmer(_StandardStemmer): + + """ + The Dutch Snowball stemmer. + + :cvar __vowels: The Dutch vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm. + :type __step3b_suffixes: tuple + :note: A detailed description of the Dutch + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/dutch/stemmer.html + + """ + + __vowels = "aeiouy\xE8" + __step1_suffixes = ("heden", "ene", "en", "se", "s") + __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig") + + def stem(self, word): + """ + Stem a Dutch word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + step2_success = False + + # Vowel accents are removed. + word = (word.replace("\xE4", "a").replace("\xE1", "a") + .replace("\xEB", "e").replace("\xE9", "e") + .replace("\xED", "i").replace("\xEF", "i") + .replace("\xF6", "o").replace("\xF3", "o") + .replace("\xFC", "u").replace("\xFA", "u")) + + # An initial 'y', a 'y' after a vowel, + # and an 'i' between self.__vowels is put into upper case. + # As from now these are treated as consonants. + if word.startswith("y"): + word = "".join(("Y", word[1:])) + + for i in range(1, len(word)): + if word[i-1] in self.__vowels and word[i] == "y": + word = "".join((word[:i], "Y", word[i+1:])) + + for i in range(1, len(word)-1): + if (word[i-1] in self.__vowels and word[i] == "i" and + word[i+1] in self.__vowels): + word = "".join((word[:i], "I", word[i+1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # R1 is adjusted so that the region before it + # contains at least 3 letters. + for i in range(1, len(word)): + if word[i] not in self.__vowels and word[i-1] in self.__vowels: + if len(word[:i+1]) < 3 and len(word[:i+1]) > 0: + r1 = word[3:] + elif len(word[:i+1]) == 0: + return word + break + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "heden": + word = "".join((word[:-5], "heid")) + r1 = "".join((r1[:-5], "heid")) + if r2.endswith("heden"): + r2 = "".join((r2[:-5], "heid")) + + elif (suffix in ("ene", "en") and + not word.endswith("heden") and + word[-len(suffix)-1] not in self.__vowels and + word[-len(suffix)-3:-len(suffix)] != "gem"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif (suffix in ("se", "s") and + word[-len(suffix)-1] not in self.__vowels and + word[-len(suffix)-1] != "j"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 2 + if r1.endswith("e") and word[-2] not in self.__vowels: + step2_success = True + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + # STEP 3a + if r2.endswith("heid") and word[-5] != "c": + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + + if (r1.endswith("en") and word[-3] not in self.__vowels and + word[-5:-2] != "gem"): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + # STEP 3b: Derivational suffixes + for suffix in self.__step3b_suffixes: + if r2.endswith(suffix): + if suffix in ("end", "ing"): + word = word[:-3] + r2 = r2[:-3] + + if r2.endswith("ig") and word[-3] != "e": + word = word[:-2] + else: + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + + elif suffix == "ig" and word[-3] != "e": + word = word[:-2] + + elif suffix == "lijk": + word = word[:-4] + r1 = r1[:-4] + + if r1.endswith("e") and word[-2] not in self.__vowels: + word = word[:-1] + if word.endswith(("kk", "dd", "tt")): + word = word[:-1] + + elif suffix == "baar": + word = word[:-4] + + elif suffix == "bar" and step2_success: + word = word[:-3] + break + + # STEP 4: Undouble vowel + if len(word) >= 4: + if word[-1] not in self.__vowels and word[-1] != "I": + if word[-3:-1] in ("aa", "ee", "oo", "uu"): + if word[-4] not in self.__vowels: + word = "".join((word[:-3], word[-3], word[-1])) + + # All occurrences of 'I' and 'Y' are put back into lower case. 
+ word = word.replace("I", "i").replace("Y", "y") + + + return word + + + +class EnglishStemmer(_StandardStemmer): + + """ + The English Snowball stemmer. + + :cvar __vowels: The English vowels. + :type __vowels: unicode + :cvar __double_consonants: The English double consonants. + :type __double_consonants: tuple + :cvar __li_ending: Letters that may directly appear before a word final 'li'. + :type __li_ending: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm. + :type __step1a_suffixes: tuple + :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm. + :type __step1b_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. + :type __step5_suffixes: tuple + :cvar __special_words: A dictionary containing words + which have to be stemmed specially. + :type __special_words: dict + :note: A detailed description of the English + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/english/stemmer.html + """ + + __vowels = "aeiouy" + __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", + "pp", "rr", "tt") + __li_ending = "cdeghkmnrt" + __step0_suffixes = ("'s'", "'s", "'") + __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s") + __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed") + __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness', + 'iveness', 'tional', 'biliti', 'lessli', + 'entli', 'ation', 'alism', 'aliti', 'ousli', + 'iviti', 'fulli', 'enci', 'anci', 'abli', + 'izer', 'ator', 'alli', 'bli', 'ogi', 'li') + __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti', + 'ative', 'ical', 'ness', 'ful') + __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment', + 'ant', 'ent', 'ism', 'ate', 'iti', 'ous', + 'ive', 'ize', 'ion', 'al', 'er', 'ic') + __step5_suffixes = ("e", "l") + __special_words = {"skis" : "ski", + "skies" : "sky", + "dying" : "die", + "lying" : "lie", + "tying" : "tie", + "idly" : "idl", + "gently" : "gentl", + "ugly" : "ugli", + "early" : "earli", + "only" : "onli", + "singly" : "singl", + "sky" : "sky", + "news" : "news", + "howe" : "howe", + "atlas" : "atlas", + "cosmos" : "cosmos", + "bias" : "bias", + "andes" : "andes", + "inning" : "inning", + "innings" : "inning", + "outing" : "outing", + "outings" : "outing", + "canning" : "canning", + "cannings" : "canning", + "herring" : "herring", + "herrings" : "herring", + "earring" : "earring", + "earrings" : "earring", + "proceed" : "proceed", + "proceeds" : "proceed", + "proceeded" : "proceed", + "proceeding" : "proceed", + "exceed" : "exceed", + "exceeds" : "exceed", + "exceeded" : "exceed", + "exceeding" : "exceed", + "succeed" : "succeed", + "succeeds" : "succeed", + "succeeded" : "succeed", + "succeeding" : "succeed"} + + def stem(self, word): + + """ + Stem an English word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + if len(word) <= 2: + return word + + elif word in self.__special_words: + return self.__special_words[word] + + # Map the different apostrophe characters to a single consistent one + word = (word.replace("\u2019", "\x27") + .replace("\u2018", "\x27") + .replace("\u201B", "\x27")) + + if word.startswith("\x27"): + word = word[1:] + + if word.startswith("y"): + word = "".join(("Y", word[1:])) + + for i in range(1, len(word)): + if word[i-1] in self.__vowels and word[i] == "y": + word = "".join((word[:i], "Y", word[i+1:])) + + step1a_vowel_found = False + step1b_vowel_found = False + + r1 = "" + r2 = "" + + if word.startswith(("gener", "commun", "arsen")): + if word.startswith(("gener", "arsen")): + r1 = word[5:] + else: + r1 = word[6:] + + for i in range(1, len(r1)): + if r1[i] not in self.__vowels and r1[i-1] in self.__vowels: + r2 = r1[i+1:] + break + else: + r1, r2 = self._r1r2_standard(word, self.__vowels) + + + # STEP 0 + for suffix in self.__step0_suffixes: + if word.endswith(suffix): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 1a + for suffix in self.__step1a_suffixes: + if word.endswith(suffix): + + if suffix == "sses": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("ied", "ies"): + if len(word[:-len(suffix)]) > 1: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif suffix == "s": + for letter in word[:-2]: + if letter in self.__vowels: + step1a_vowel_found = True + break + + if step1a_vowel_found: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + break + + # STEP 1b + for suffix in self.__step1b_suffixes: + if word.endswith(suffix): + if suffix in ("eed", "eedly"): + + if r1.endswith(suffix): + word = "".join((word[:-len(suffix)], "ee")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ee")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ee")) + else: + r2 = "" + else: + for letter in word[:-len(suffix)]: + if letter in self.__vowels: + step1b_vowel_found = True + break + + if step1b_vowel_found: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + + if word.endswith(("at", "bl", "iz")): + word = "".join((word, "e")) + r1 = "".join((r1, "e")) + + if len(word) > 5 or len(r1) >=3: + r2 = "".join((r2, "e")) + + elif word.endswith(self.__double_consonants): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif ((r1 == "" and len(word) >= 3 and + word[-1] not in self.__vowels and + word[-1] not in "wxY" and + word[-2] in self.__vowels and + word[-3] not in self.__vowels) + or + (r1 == "" and len(word) == 2 and + word[0] in self.__vowels and + word[1] not in self.__vowels)): + + word = "".join((word, "e")) + + if len(r1) > 0: + r1 = "".join((r1, "e")) + + if len(r2) > 0: + r2 = "".join((r2, "e")) + break + + # STEP 1c + if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels: + word = "".join((word[:-1], "i")) + if len(r1) >= 1: + r1 = "".join((r1[:-1], "i")) + else: + r1 = "" + + if len(r2) >= 1: + r2 = "".join((r2[:-1], "i")) + else: + r2 = "" + + # STEP 2 + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix == "tional": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("enci", "anci", "abli"): + word = "".join((word[:-1], "e")) + + if len(r1) >= 1: + r1 = "".join((r1[:-1], "e")) + else: + r1 = "" + + if len(r2) >= 1: + r2 = 
"".join((r2[:-1], "e")) + else: + r2 = "" + + elif suffix == "entli": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix in ("izer", "ization"): + word = "".join((word[:-len(suffix)], "ize")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ize")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ize")) + else: + r2 = "" + + elif suffix in ("ational", "ation", "ator"): + word = "".join((word[:-len(suffix)], "ate")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ate")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ate")) + else: + r2 = "e" + + elif suffix in ("alism", "aliti", "alli"): + word = "".join((word[:-len(suffix)], "al")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "al")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "al")) + else: + r2 = "" + + elif suffix == "fulness": + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + + elif suffix in ("ousli", "ousness"): + word = "".join((word[:-len(suffix)], "ous")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ous")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ous")) + else: + r2 = "" + + elif suffix in ("iveness", "iviti"): + word = "".join((word[:-len(suffix)], "ive")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ive")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ive")) + else: + r2 = "e" + + elif suffix in ("biliti", "bli"): + word = "".join((word[:-len(suffix)], "ble")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ble")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ble")) + else: + r2 = "" + + elif suffix == "ogi" and word[-4] == "l": + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + + elif suffix in ("fulli", "lessli"): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "li" and word[-3] in self.__li_ending: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix == "tional": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "ational": + word = "".join((word[:-len(suffix)], "ate")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ate")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ate")) + else: + r2 = "" + + elif suffix == "alize": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + + elif suffix in ("icate", "iciti", "ical"): + word = "".join((word[:-len(suffix)], "ic")) + + if len(r1) >= len(suffix): + r1 = "".join((r1[:-len(suffix)], "ic")) + else: + r1 = "" + + if len(r2) >= len(suffix): + r2 = "".join((r2[:-len(suffix)], "ic")) + else: + r2 = "" + + elif suffix in ("ful", "ness"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + + elif suffix == "ative" and r2.endswith(suffix): + word = word[:-5] + r1 = r1[:-5] + r2 = r2[:-5] + break + + # STEP 4 + for suffix in self.__step4_suffixes: + if word.endswith(suffix): + if r2.endswith(suffix): + if suffix == "ion": + if word[-4] in "st": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 5 + if r2.endswith("l") and word[-2] == "l": + word = word[:-1] + elif r2.endswith("e"): + 
word = word[:-1] + elif r1.endswith("e"): + if len(word) >= 4 and (word[-2] in self.__vowels or + word[-2] in "wxY" or + word[-3] not in self.__vowels or + word[-4] in self.__vowels): + word = word[:-1] + + + word = word.replace("Y", "y") + + + return word + + + +class FinnishStemmer(_StandardStemmer): + + """ + The Finnish Snowball stemmer. + + :cvar __vowels: The Finnish vowels. + :type __vowels: unicode + :cvar __restricted_vowels: A subset of the Finnish vowels. + :type __restricted_vowels: unicode + :cvar __long_vowels: The Finnish vowels in their long forms. + :type __long_vowels: tuple + :cvar __consonants: The Finnish consonants. + :type __consonants: unicode + :cvar __double_consonants: The Finnish double consonants. + :type __double_consonants: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :note: A detailed description of the Finnish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/finnish/stemmer.html + """ + + __vowels = "aeiouy\xE4\xF6" + __restricted_vowels = "aeiou\xE4\xF6" + __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", + "\xF6\xF6") + __consonants = "bcdfghjklmnpqrstvwxz" + __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj", + "kk", "ll", "mm", "nn", "pp", "qq", "rr", + "ss", "tt", "vv", "ww", "xx", "zz") + __step1_suffixes = ('kaan', 'k\xE4\xE4n', 'sti', 'kin', 'han', + 'h\xE4n', 'ko', 'k\xF6', 'pa', 'p\xE4') + __step2_suffixes = ('nsa', 'ns\xE4', 'mme', 'nne', 'si', 'ni', + 'an', '\xE4n', 'en') + __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin', + 'hon', 'h\xE4n', 'h\xF6n', 'den', 'tta', + 'tt\xE4', 'ssa', 'ss\xE4', 'sta', + 'st\xE4', 'lla', 'll\xE4', 'lta', + 'lt\xE4', 'lle', 'ksi', 'ine', 'ta', + 't\xE4', 'na', 'n\xE4', 'a', '\xE4', + 'n') + __step4_suffixes = ('impi', 'impa', 'imp\xE4', 'immi', 'imma', + 'imm\xE4', 'mpi', 'mpa', 'mp\xE4', 'mmi', + 'mma', 'mm\xE4', 'eja', 'ej\xE4') + + def stem(self, word): + """ + Stem a Finnish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + step3_success = False + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # STEP 1: Particles etc. 
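+        # Illustrative example (hypothetical input): for a form such as
+        # "kirjakin" ("kirja" + enclitic "kin"), R1 is "jakin"; "kin" lies in
+        # R1 and is preceded by "a", one of the letters "ntaeiouy\xE4\xF6",
+        # so the particle is removed and the word becomes "kirja".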
+ for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "sti": + if suffix in r2: + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + if word[-len(suffix)-1] in "ntaeiouy\xE4\xF6": + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 2: Possessives + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + if suffix == "si": + if word[-3] != "k": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "ni": + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + if word.endswith("kse"): + word = "".join((word[:-3], "ksi")) + + if r1.endswith("kse"): + r1 = "".join((r1[:-3], "ksi")) + + if r2.endswith("kse"): + r2 = "".join((r2[:-3], "ksi")) + + elif suffix == "an": + if (word[-4:-2] in ("ta", "na") or + word[-5:-2] in ("ssa", "sta", "lla", "lta")): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "\xE4n": + if (word[-4:-2] in ("t\xE4", "n\xE4") or + word[-5:-2] in ("ss\xE4", "st\xE4", + "ll\xE4", "lt\xE4")): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + + elif suffix == "en": + if word[-5:-2] in ("lle", "ine"): + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + break + + # STEP 3: Cases + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix in ("han", "hen", "hin", "hon", "h\xE4n", + "h\xF6n"): + if ((suffix == "han" and word[-4] == "a") or + (suffix == "hen" and word[-4] == "e") or + (suffix == "hin" and word[-4] == "i") or + (suffix == "hon" and word[-4] == "o") or + (suffix == "h\xE4n" and word[-4] == "\xE4") or + (suffix == "h\xF6n" and word[-4] == "\xF6")): + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + step3_success = True + + elif suffix in ("siin", "den", "tten"): + if (word[-len(suffix)-1] == "i" and + word[-len(suffix)-2] in self.__restricted_vowels): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + step3_success = True + else: + continue + + elif suffix == "seen": + if word[-6:-4] in self.__long_vowels: + word = word[:-4] + r1 = r1[:-4] + r2 = r2[:-4] + step3_success = True + else: + continue + + elif suffix in ("a", "\xE4"): + if word[-2] in self.__vowels and word[-3] in self.__consonants: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + step3_success = True + + elif suffix in ("tta", "tt\xE4"): + if word[-4] == "e": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + step3_success = True + + elif suffix == "n": + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + step3_success = True + + if word[-2:] == "ie" or word[-2:] in self.__long_vowels: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + step3_success = True + break + + # STEP 4: Other endings + for suffix in self.__step4_suffixes: + if r2.endswith(suffix): + if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", + "mm\xE4"): + if word[-5:-3] != "po": + word = word[:-3] + r1 = r1[:-3] + r2 = r2[:-3] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 5: Plurals + if step3_success and len(r1) >= 1 and r1[-1] in "ij": + word = word[:-1] + r1 = r1[:-1] + + elif (not step3_success and len(r1) >= 2 and + r1[-1] == "t" and r1[-2] in self.__vowels): + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + if r2.endswith("imma"): + word = word[:-4] + r1 = r1[:-4] + elif r2.endswith("mma") and r2[-5:-3] != "po": + word = word[:-3] + r1 = r1[:-3] + + # STEP 6: Tidying up + if 
r1[-2:] in self.__long_vowels: + word = word[:-1] + r1 = r1[:-1] + + if (len(r1) >= 2 and r1[-2] in self.__consonants and + r1[-1] in "a\xE4ei"): + word = word[:-1] + r1 = r1[:-1] + + if r1.endswith(("oj", "uj")): + word = word[:-1] + r1 = r1[:-1] + + if r1.endswith("jo"): + word = word[:-1] + r1 = r1[:-1] + + # If the word ends with a double consonant + # followed by zero or more vowels, the last consonant is removed. + for i in range(1, len(word)): + if word[-i] in self.__vowels: + continue + else: + if i == 1: + if word[-i-1:] in self.__double_consonants: + word = word[:-1] + else: + if word[-i-1:-i+1] in self.__double_consonants: + word = "".join((word[:-i], word[-i+1:])) + break + + + return word + + + +class FrenchStemmer(_StandardStemmer): + + """ + The French Snowball stemmer. + + :cvar __vowels: The French vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. + :type __step2a_suffixes: tuple + :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. + :type __step2b_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :note: A detailed description of the French + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/french/stemmer.html + """ + + __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9" + __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice', + 'ateurs', 'ations', 'logies', 'usions', + 'utions', 'ements', 'amment', 'emment', + 'ances', 'iqUes', 'ismes', 'ables', 'istes', + 'ateur', 'ation', 'logie', 'usion', 'ution', + 'ences', 'ement', 'euses', 'ments', 'ance', + 'iqUe', 'isme', 'able', 'iste', 'ence', + 'it\xE9s', 'ives', 'eaux', 'euse', 'ment', + 'eux', 'it\xE9', 'ive', 'ifs', 'aux', 'if') + __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante', + 'issants', 'issions', 'irions', 'issais', + 'issait', 'issant', 'issent', 'issiez', 'issons', + 'irais', 'irait', 'irent', 'iriez', 'irons', + 'iront', 'isses', 'issez', '\xEEmes', + '\xEEtes', 'irai', 'iras', 'irez', 'isse', + 'ies', 'ira', '\xEEt', 'ie', 'ir', 'is', + 'it', 'i') + __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent', + 'assiez', '\xE8rent', 'erais', 'erait', + 'eriez', 'erons', 'eront', 'aIent', 'antes', + 'asses', 'ions', 'erai', 'eras', 'erez', + '\xE2mes', '\xE2tes', 'ante', 'ants', + 'asse', '\xE9es', 'era', 'iez', 'ais', + 'ait', 'ant', '\xE9e', '\xE9s', 'er', + 'ez', '\xE2t', 'ai', 'as', '\xE9', 'a') + __step4_suffixes = ('i\xE8re', 'I\xE8re', 'ion', 'ier', 'Ier', + 'e', '\xEB') + + def stem(self, word): + """ + Stem a French word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + step1_success = False + rv_ending_found = False + step2a_success = False + step2b_success = False + + # Every occurrence of 'u' after 'q' is put into upper case. + for i in range(1, len(word)): + if word[i-1] == "q" and word[i] == "u": + word = "".join((word[:i], "U", word[i+1:])) + + # Every occurrence of 'u' and 'i' + # between vowels is put into upper case. + # Every occurrence of 'y' preceded or + # followed by a vowel is also put into upper case. 
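+        # For example (illustrative): "jouer" becomes "joUer" and "payer"
+        # becomes "paYer"; the upper-case letters are then handled as
+        # consonants by the suffix steps below.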
+ for i in range(1, len(word)-1): + if word[i-1] in self.__vowels and word[i+1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i+1:])) + + elif word[i] == "i": + word = "".join((word[:i], "I", word[i+1:])) + + if word[i-1] in self.__vowels or word[i+1] in self.__vowels: + if word[i] == "y": + word = "".join((word[:i], "Y", word[i+1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self.__rv_french(word, self.__vowels) + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "eaux": + word = word[:-1] + step1_success = True + + elif suffix in ("euse", "euses"): + if suffix in r2: + word = word[:-len(suffix)] + step1_success = True + + elif suffix in r1: + word = "".join((word[:-len(suffix)], "eux")) + step1_success = True + + elif suffix in ("ement", "ements") and suffix in rv: + word = word[:-len(suffix)] + step1_success = True + + if word[-2:] == "iv" and "iv" in r2: + word = word[:-2] + + if word[-2:] == "at" and "at" in r2: + word = word[:-2] + + elif word[-3:] == "eus": + if "eus" in r2: + word = word[:-3] + elif "eus" in r1: + word = "".join((word[:-1], "x")) + + elif word[-3:] in ("abl", "iqU"): + if "abl" in r2 or "iqU" in r2: + word = word[:-3] + + elif word[-3:] in ("i\xE8r", "I\xE8r"): + if "i\xE8r" in rv or "I\xE8r" in rv: + word = "".join((word[:-3], "i")) + + elif suffix == "amment" and suffix in rv: + word = "".join((word[:-6], "ant")) + rv = "".join((rv[:-6], "ant")) + rv_ending_found = True + + elif suffix == "emment" and suffix in rv: + word = "".join((word[:-6], "ent")) + rv_ending_found = True + + elif (suffix in ("ment", "ments") and suffix in rv and + not rv.startswith(suffix) and + rv[rv.rindex(suffix)-1] in self.__vowels): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + rv_ending_found = True + + elif suffix == "aux" and suffix in r1: + word = "".join((word[:-2], "l")) + step1_success = True + + elif (suffix in ("issement", "issements") and suffix in r1 + and word[-len(suffix)-1] not in self.__vowels): + word = word[:-len(suffix)] + step1_success = True + + elif suffix in ("ance", "iqUe", "isme", "able", "iste", + "eux", "ances", "iqUes", "ismes", + "ables", "istes") and suffix in r2: + word = word[:-len(suffix)] + step1_success = True + + elif suffix in ("atrice", "ateur", "ation", "atrices", + "ateurs", "ations") and suffix in r2: + word = word[:-len(suffix)] + step1_success = True + + if word[-2:] == "ic": + if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + + elif suffix in ("logie", "logies") and suffix in r2: + word = "".join((word[:-len(suffix)], "log")) + step1_success = True + + elif (suffix in ("usion", "ution", "usions", "utions") and + suffix in r2): + word = "".join((word[:-len(suffix)], "u")) + step1_success = True + + elif suffix in ("ence", "ences") and suffix in r2: + word = "".join((word[:-len(suffix)], "ent")) + step1_success = True + + elif suffix in ("it\xE9", "it\xE9s") and suffix in r2: + word = word[:-len(suffix)] + step1_success = True + + if word[-4:] == "abil": + if "abil" in r2: + word = word[:-4] + else: + word = "".join((word[:-2], "l")) + + elif word[-2:] == "ic": + if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + + elif word[-2:] == "iv": + if "iv" in r2: + word = word[:-2] + + elif (suffix in ("if", "ive", "ifs", "ives") and + suffix in r2): + word = word[:-len(suffix)] + step1_success = True + + if word[-2:] == "at" and "at" in r2: + word = word[:-2] + + if 
word[-2:] == "ic": + if "ic" in r2: + word = word[:-2] + else: + word = "".join((word[:-2], "iqU")) + break + + # STEP 2a: Verb suffixes beginning 'i' + if not step1_success or rv_ending_found: + for suffix in self.__step2a_suffixes: + if word.endswith(suffix): + if (suffix in rv and len(rv) > len(suffix) and + rv[rv.rindex(suffix)-1] not in self.__vowels): + word = word[:-len(suffix)] + step2a_success = True + break + + # STEP 2b: Other verb suffixes + if not step2a_success: + for suffix in self.__step2b_suffixes: + if rv.endswith(suffix): + if suffix == "ions" and "ions" in r2: + word = word[:-4] + step2b_success = True + + elif suffix in ('eraIent', 'erions', '\xE8rent', + 'erais', 'erait', 'eriez', + 'erons', 'eront', 'erai', 'eras', + 'erez', '\xE9es', 'era', 'iez', + '\xE9e', '\xE9s', 'er', 'ez', + '\xE9'): + word = word[:-len(suffix)] + step2b_success = True + + elif suffix in ('assions', 'assent', 'assiez', + 'aIent', 'antes', 'asses', + '\xE2mes', '\xE2tes', 'ante', + 'ants', 'asse', 'ais', 'ait', + 'ant', '\xE2t', 'ai', 'as', + 'a'): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + step2b_success = True + if rv.endswith("e"): + word = word[:-1] + break + + # STEP 3 + if step1_success or step2a_success or step2b_success: + if word[-1] == "Y": + word = "".join((word[:-1], "i")) + elif word[-1] == "\xE7": + word = "".join((word[:-1], "c")) + + # STEP 4: Residual suffixes + else: + if (len(word) >= 2 and word[-1] == "s" and + word[-2] not in "aiou\xE8s"): + word = word[:-1] + + for suffix in self.__step4_suffixes: + if word.endswith(suffix): + if suffix in rv: + if (suffix == "ion" and suffix in r2 and + rv[-4] in "st"): + word = word[:-3] + + elif suffix in ("ier", "i\xE8re", "Ier", + "I\xE8re"): + word = "".join((word[:-len(suffix)], "i")) + + elif suffix == "e": + word = word[:-1] + + elif suffix == "\xEB" and word[-3:-1] == "gu": + word = word[:-1] + break + + # STEP 5: Undouble + if word.endswith(("enn", "onn", "ett", "ell", "eill")): + word = word[:-1] + + # STEP 6: Un-accent + for i in range(1, len(word)): + if word[-i] not in self.__vowels: + i += 1 + else: + if i != 1 and word[-i] in ("\xE9", "\xE8"): + word = "".join((word[:-i], "e", word[-i+1:])) + break + + word = (word.replace("I", "i") + .replace("U", "u") + .replace("Y", "y")) + + + return word + + + + def __rv_french(self, word, vowels): + """ + Return the region RV that is used by the French stemmer. + + If the word begins with two vowels, RV is the region after + the third letter. Otherwise, it is the region after the first + vowel not at the beginning of the word, or the end of the word + if these positions cannot be found. (Exceptionally, u'par', + u'col' or u'tap' at the beginning of a word is also taken to + define RV as the region to their right.) + + :param word: The French word whose region RV is determined. + :type word: str or unicode + :param vowels: The French vowels that are used to determine + the region RV. + :type vowels: unicode + :return: the region RV for the respective French word. + :rtype: unicode + :note: This helper method is invoked by the stem method of + the subclass FrenchStemmer. It is not to be invoked directly! + + """ + rv = "" + if len(word) >= 2: + if (word.startswith(("par", "col", "tap")) or + (word[0] in vowels and word[1] in vowels)): + rv = word[3:] + else: + for i in range(1, len(word)): + if word[i] in vowels: + rv = word[i+1:] + break + + return rv + + + +class GermanStemmer(_StandardStemmer): + + """ + The German Snowball stemmer. 
+ + :cvar __vowels: The German vowels. + :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __st_ending: Letter that may directly appear before a word final 'st'. + :type __st_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the German + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/german/stemmer.html + + """ + + __vowels = "aeiouy\xE4\xF6\xFC" + __s_ending = "bdfghklmnrt" + __st_ending = "bdfghklmnt" + + __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s") + __step2_suffixes = ("est", "en", "er", "st") + __step3_suffixes = ("isch", "lich", "heit", "keit", + "end", "ung", "ig", "ik") + + def stem(self, word): + """ + Stem a German word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + word = word.replace("\xDF", "ss") + + # Every occurrence of 'u' and 'y' + # between vowels is put into upper case. + for i in range(1, len(word)-1): + if word[i-1] in self.__vowels and word[i+1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i+1:])) + + elif word[i] == "y": + word = "".join((word[:i], "Y", word[i+1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + + # R1 is adjusted so that the region before it + # contains at least 3 letters. + for i in range(1, len(word)): + if word[i] not in self.__vowels and word[i-1] in self.__vowels: + if len(word[:i+1]) < 3 and len(word[:i+1]) > 0: + r1 = word[3:] + elif len(word[:i+1]) == 0: + return word + break + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if (suffix in ("en", "es", "e") and + word[-len(suffix)-4:-len(suffix)] == "niss"): + word = word[:-len(suffix)-1] + r1 = r1[:-len(suffix)-1] + r2 = r2[:-len(suffix)-1] + + elif suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + r2 = r2[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + if suffix == "st": + if word[-3] in self.__st_ending and len(word[:-3]) >= 3: + word = word[:-2] + r1 = r1[:-2] + r2 = r2[:-2] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + break + + # STEP 3: Derivational suffixes + for suffix in self.__step3_suffixes: + if r2.endswith(suffix): + if suffix in ("end", "ung"): + if ("ig" in r2[-len(suffix)-2:-len(suffix)] and + "e" not in r2[-len(suffix)-3:-len(suffix)-2]): + word = word[:-len(suffix)-2] + else: + word = word[:-len(suffix)] + + elif (suffix in ("ig", "ik", "isch") and + "e" not in r2[-len(suffix)-1:-len(suffix)]): + word = word[:-len(suffix)] + + elif suffix in ("lich", "heit"): + if ("er" in r1[-len(suffix)-2:-len(suffix)] or + "en" in r1[-len(suffix)-2:-len(suffix)]): + word = word[:-len(suffix)-2] + else: + word = word[:-len(suffix)] + + elif suffix == "keit": + if "lich" in r2[-len(suffix)-4:-len(suffix)]: + word = word[:-len(suffix)-4] + + elif "ig" in r2[-len(suffix)-2:-len(suffix)]: + word = word[:-len(suffix)-2] + else: 
+ word = word[:-len(suffix)] + break + + # Umlaut accents are removed and + # 'u' and 'y' are put back into lower case. + word = (word.replace("\xE4", "a").replace("\xF6", "o") + .replace("\xFC", "u").replace("U", "u") + .replace("Y", "y")) + + + return word + + + +class HungarianStemmer(_LanguageSpecificStemmer): + + """ + The Hungarian Snowball stemmer. + + :cvar __vowels: The Hungarian vowels. + :type __vowels: unicode + :cvar __digraphs: The Hungarian digraphs. + :type __digraphs: tuple + :cvar __double_consonants: The Hungarian double consonants. + :type __double_consonants: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. + :type __step4_suffixes: tuple + :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. + :type __step5_suffixes: tuple + :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm. + :type __step6_suffixes: tuple + :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm. + :type __step7_suffixes: tuple + :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm. + :type __step8_suffixes: tuple + :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm. + :type __step9_suffixes: tuple + :note: A detailed description of the Hungarian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/hungarian/stemmer.html + + """ + + __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB" + __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs") + __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg", + "ggy", "jj", "kk", "ll", "lly", "mm", + "nn", "nny", "pp", "rr", "ss", "ssz", + "tt", "tty", "vv", "zz", "zzs") + + __step1_suffixes = ("al", "el") + __step2_suffixes = ('k\xE9ppen', 'onk\xE9nt', 'enk\xE9nt', + 'ank\xE9nt', 'k\xE9pp', 'k\xE9nt', 'ban', + 'ben', 'nak', 'nek', 'val', 'vel', 't\xF3l', + 't\xF5l', 'r\xF3l', 'r\xF5l', 'b\xF3l', + 'b\xF5l', 'hoz', 'hez', 'h\xF6z', + 'n\xE1l', 'n\xE9l', '\xE9rt', 'kor', + 'ba', 'be', 'ra', 're', 'ig', 'at', 'et', + 'ot', '\xF6t', 'ul', '\xFCl', 'v\xE1', + 'v\xE9', 'en', 'on', 'an', '\xF6n', + 'n', 't') + __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n") + __step4_suffixes = ('astul', 'est\xFCl', '\xE1stul', + '\xE9st\xFCl', 'stul', 'st\xFCl') + __step5_suffixes = ("\xE1", "\xE9") + __step6_suffixes = ('ok\xE9', '\xF6k\xE9', 'ak\xE9', + 'ek\xE9', '\xE1k\xE9', '\xE1\xE9i', + '\xE9k\xE9', '\xE9\xE9i', 'k\xE9', + '\xE9i', '\xE9\xE9', '\xE9') + __step7_suffixes = ('\xE1juk', '\xE9j\xFCk', '\xFCnk', + 'unk', 'juk', 'j\xFCk', '\xE1nk', + '\xE9nk', 'nk', 'uk', '\xFCk', 'em', + 'om', 'am', 'od', 'ed', 'ad', '\xF6d', + 'ja', 'je', '\xE1m', '\xE1d', '\xE9m', + '\xE9d', 'm', 'd', 'a', 'e', 'o', + '\xE1', '\xE9') + __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok', + 'eitek', '\xE1itok', '\xE9itek', 'jaim', + 'jeim', 'jaid', 'jeid', 'eink', 'aink', + 'itek', 'jeik', 'jaik', '\xE1ink', + '\xE9ink', 'aim', 'eim', 'aid', 'eid', + 'jai', 'jei', 'ink', 'aik', 'eik', + '\xE1im', '\xE1id', '\xE1ik', '\xE9im', + '\xE9id', '\xE9ik', 'im', 'id', 'ai', + 'ei', 'ik', '\xE1i', '\xE9i', 'i') + __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", + "ek", "ak", "k") + 
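+    # Note (illustrative summary): every step in stem() below removes a suffix
+    # only when it lies inside the region R1 returned by __r1_hungarian(),
+    # which is defined after stem(); e.g. for the vowel-initial word "alma",
+    # R1 is "ma".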
+ def stem(self, word): + """ + Stem an Hungarian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs) + + # STEP 1: Remove instrumental case + if r1.endswith(self.__step1_suffixes): + for double_cons in self.__double_consonants: + if word[-2-len(double_cons):-2] == double_cons: + word = "".join((word[:-4], word[-3])) + + if r1[-2-len(double_cons):-2] == double_cons: + r1 = "".join((r1[:-4], r1[-3])) + break + + # STEP 2: Remove frequent cases + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + + if r1.endswith("\xE1"): + word = "".join((word[:-1], "a")) + r1 = "".join((r1[:-1], "a")) + + elif r1.endswith("\xE9"): + word = "".join((word[:-1], "e")) + r1 = "".join((r1[:-1], "e")) + break + + # STEP 3: Remove special cases + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix == "\xE9n": + word = "".join((word[:-2], "e")) + r1 = "".join((r1[:-2], "e")) + else: + word = "".join((word[:-len(suffix)], "a")) + r1 = "".join((r1[:-len(suffix)], "a")) + break + + # STEP 4: Remove other cases + for suffix in self.__step4_suffixes: + if r1.endswith(suffix): + if suffix == "\xE1stul": + word = "".join((word[:-5], "a")) + r1 = "".join((r1[:-5], "a")) + + elif suffix == "\xE9st\xFCl": + word = "".join((word[:-5], "e")) + r1 = "".join((r1[:-5], "e")) + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 5: Remove factive case + for suffix in self.__step5_suffixes: + if r1.endswith(suffix): + for double_cons in self.__double_consonants: + if word[-1-len(double_cons):-1] == double_cons: + word = "".join((word[:-3], word[-2])) + + if r1[-1-len(double_cons):-1] == double_cons: + r1 = "".join((r1[:-3], r1[-2])) + break + + # STEP 6: Remove owned + for suffix in self.__step6_suffixes: + if r1.endswith(suffix): + if suffix in ("\xE1k\xE9", "\xE1\xE9i"): + word = "".join((word[:-3], "a")) + r1 = "".join((r1[:-3], "a")) + + elif suffix in ("\xE9k\xE9", "\xE9\xE9i", + "\xE9\xE9"): + word = "".join((word[:-len(suffix)], "e")) + r1 = "".join((r1[:-len(suffix)], "e")) + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 7: Remove singular owner suffixes + for suffix in self.__step7_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix in ("\xE1nk", "\xE1juk", "\xE1m", + "\xE1d", "\xE1"): + word = "".join((word[:-len(suffix)], "a")) + r1 = "".join((r1[:-len(suffix)], "a")) + + elif suffix in ("\xE9nk", "\xE9j\xFCk", + "\xE9m", "\xE9d", "\xE9"): + word = "".join((word[:-len(suffix)], "e")) + r1 = "".join((r1[:-len(suffix)], "e")) + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 8: Remove plural owner suffixes + for suffix in self.__step8_suffixes: + if word.endswith(suffix): + if r1.endswith(suffix): + if suffix in ("\xE1im", "\xE1id", "\xE1i", + "\xE1ink", "\xE1itok", "\xE1ik"): + word = "".join((word[:-len(suffix)], "a")) + r1 = "".join((r1[:-len(suffix)], "a")) + + elif suffix in ("\xE9im", "\xE9id", "\xE9i", + "\xE9ink", "\xE9itek", "\xE9ik"): + word = "".join((word[:-len(suffix)], "e")) + r1 = "".join((r1[:-len(suffix)], "e")) + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 9: Remove plural suffixes + for suffix in self.__step9_suffixes: + if 
word.endswith(suffix): + if r1.endswith(suffix): + if suffix == "\xE1k": + word = "".join((word[:-2], "a")) + elif suffix == "\xE9k": + word = "".join((word[:-2], "e")) + else: + word = word[:-len(suffix)] + break + + + return word + + + + def __r1_hungarian(self, word, vowels, digraphs): + """ + Return the region R1 that is used by the Hungarian stemmer. + + If the word begins with a vowel, R1 is defined as the region + after the first consonant or digraph (= two letters stand for + one phoneme) in the word. If the word begins with a consonant, + it is defined as the region after the first vowel in the word. + If the word does not contain both a vowel and consonant, R1 + is the null region at the end of the word. + + :param word: The Hungarian word whose region R1 is determined. + :type word: str or unicode + :param vowels: The Hungarian vowels that are used to determine + the region R1. + :type vowels: unicode + :param digraphs: The digraphs that are used to determine the + region R1. + :type digraphs: tuple + :return: the region R1 for the respective word. + :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + HungarianStemmer. It is not to be invoked directly! + + """ + r1 = "" + if word[0] in vowels: + for digraph in digraphs: + if digraph in word[1:]: + r1 = word[word.index(digraph[-1])+1:] + return r1 + + for i in range(1, len(word)): + if word[i] not in vowels: + r1 = word[i+1:] + break + else: + for i in range(1, len(word)): + if word[i] in vowels: + r1 = word[i+1:] + break + + return r1 + + + +class ItalianStemmer(_StandardStemmer): + + """ + The Italian Snowball stemmer. + + :cvar __vowels: The Italian vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
+ :type __step2_suffixes: tuple + :note: A detailed description of the Italian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/italian/stemmer.html + + """ + + __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9" + __step0_suffixes = ('gliela', 'gliele', 'glieli', 'glielo', + 'gliene', 'sene', 'mela', 'mele', 'meli', + 'melo', 'mene', 'tela', 'tele', 'teli', + 'telo', 'tene', 'cela', 'cele', 'celi', + 'celo', 'cene', 'vela', 'vele', 'veli', + 'velo', 'vene', 'gli', 'ci', 'la', 'le', + 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi') + __step1_suffixes = ('atrice', 'atrici', 'azione', 'azioni', + 'uzione', 'uzioni', 'usione', 'usioni', + 'amento', 'amenti', 'imento', 'imenti', + 'amente', 'abile', 'abili', 'ibile', 'ibili', + 'mente', 'atore', 'atori', 'logia', 'logie', + 'anza', 'anze', 'iche', 'ichi', 'ismo', + 'ismi', 'ista', 'iste', 'isti', 'ist\xE0', + 'ist\xE8', 'ist\xEC', 'ante', 'anti', + 'enza', 'enze', 'ico', 'ici', 'ica', 'ice', + 'oso', 'osi', 'osa', 'ose', 'it\xE0', + 'ivo', 'ivi', 'iva', 'ive') + __step2_suffixes = ('erebbero', 'irebbero', 'assero', 'assimo', + 'eranno', 'erebbe', 'eremmo', 'ereste', + 'eresti', 'essero', 'iranno', 'irebbe', + 'iremmo', 'ireste', 'iresti', 'iscano', + 'iscono', 'issero', 'arono', 'avamo', 'avano', + 'avate', 'eremo', 'erete', 'erono', 'evamo', + 'evano', 'evate', 'iremo', 'irete', 'irono', + 'ivamo', 'ivano', 'ivate', 'ammo', 'ando', + 'asse', 'assi', 'emmo', 'enda', 'ende', + 'endi', 'endo', 'erai', 'erei', 'Yamo', + 'iamo', 'immo', 'irai', 'irei', 'isca', + 'isce', 'isci', 'isco', 'ano', 'are', 'ata', + 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', + 'er\xE0', 'ere', 'er\xF2', 'ete', 'eva', + 'evi', 'evo', 'ir\xE0', 'ire', 'ir\xF2', + 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', + 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', + 'ar', 'ir') + + def stem(self, word): + """ + Stem an Italian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + step1_success = False + + # All acute accents are replaced by grave accents. + word = (word.replace("\xE1", "\xE0") + .replace("\xE9", "\xE8") + .replace("\xED", "\xEC") + .replace("\xF3", "\xF2") + .replace("\xFA", "\xF9")) + + # Every occurrence of 'u' after 'q' + # is put into upper case. + for i in range(1, len(word)): + if word[i-1] == "q" and word[i] == "u": + word = "".join((word[:i], "U", word[i+1:])) + + # Every occurrence of 'u' and 'i' + # between vowels is put into upper case. 
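+        # For example (illustrative): "gioia" becomes "gioIa", so the 'i'
+        # between vowels is no longer counted as a vowel in the later steps.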
+ for i in range(1, len(word)-1): + if word[i-1] in self.__vowels and word[i+1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i+1:])) + + elif word [i] == "i": + word = "".join((word[:i], "I", word[i+1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Attached pronoun + for suffix in self.__step0_suffixes: + if rv.endswith(suffix): + if rv[-len(suffix)-4:-len(suffix)] in ("ando", "endo"): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + elif (rv[-len(suffix)-2:-len(suffix)] in + ("ar", "er", "ir")): + word = "".join((word[:-len(suffix)], "e")) + r1 = "".join((r1[:-len(suffix)], "e")) + r2 = "".join((r2[:-len(suffix)], "e")) + rv = "".join((rv[:-len(suffix)], "e")) + break + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic")): + word = word[:-2] + rv = rv[:-2] + + elif r2 .endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif (suffix in ("amento", "amenti", + "imento", "imenti") and + rv.endswith(suffix)): + step1_success = True + word = word[:-6] + rv = rv[:-6] + + elif r2.endswith(suffix): + step1_success = True + if suffix in ("azione", "azioni", "atore", "atori"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + + elif suffix in ("logia", "logie"): + word = word[:-2] + rv = word[:-2] + + elif suffix in ("uzione", "uzioni", + "usione", "usioni"): + word = word[:-5] + rv = rv[:-5] + + elif suffix in ("enza", "enze"): + word = "".join((word[:-2], "te")) + rv = "".join((rv[:-2], "te")) + + elif suffix == "it\xE0": + word = word[:-3] + r2 = r2[:-3] + rv = rv[:-3] + + if r2.endswith(("ic", "iv")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("ivo", "ivi", "iva", "ive"): + word = word[:-3] + r2 = r2[:-3] + rv = rv[:-3] + + if r2.endswith("at"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 2: Verb suffixes + if not step1_success: + for suffix in self.__step2_suffixes: + if rv.endswith(suffix): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 3a + if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", + "\xEC", "\xF2")): + word = word[:-1] + rv = rv[:-1] + + if rv.endswith("i"): + word = word[:-1] + rv = rv[:-1] + + # STEP 3b + if rv.endswith(("ch", "gh")): + word = word[:-1] + + word = word.replace("I", "i").replace("U", "u") + + + return word + + + +class NorwegianStemmer(_ScandinavianStemmer): + + """ + The Norwegian Snowball stemmer. + + :cvar __vowels: The Norwegian vowels. + :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
+ :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Norwegian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/norwegian/stemmer.html + + """ + + __vowels = "aeiouy\xE6\xE5\xF8" + __s_ending = "bcdfghjlmnoprtvyz" + __step1_suffixes = ("hetenes", "hetene", "hetens", "heter", + "heten", "endes", "ande", "ende", "edes", + "enes", "erte", "ede", "ane", "ene", "ens", + "ers", "ets", "het", "ast", "ert", "en", + "ar", "er", "as", "es", "et", "a", "e", "s") + + __step2_suffixes = ("dt", "vt") + + __step3_suffixes = ("hetslov", "eleg", "elig", "elov", "slov", + "leg", "eig", "lig", "els", "lov", "ig") + + def stem(self, word): + """ + Stem a Norwegian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + r1 = self._r1_scandinavian(word, self.__vowels) + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix in ("erte", "ert"): + word = "".join((word[:-len(suffix)], "er")) + r1 = "".join((r1[:-len(suffix)], "er")) + + elif suffix == "s": + if (word[-2] in self.__s_ending or + (word[-2] == "k" and word[-3] not in self.__vowels)): + word = word[:-1] + r1 = r1[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + word = word[:-len(suffix)] + break + + + return word + + + +class PortugueseStemmer(_StandardStemmer): + + """ + The Portuguese Snowball stemmer. + + :cvar __vowels: The Portuguese vowels. + :type __vowels: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. 
+ :type __step4_suffixes: tuple + :note: A detailed description of the Portuguese + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/portuguese/stemmer.html + + """ + + __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4" + __step1_suffixes = ('amentos', 'imentos', 'uciones', 'amento', + 'imento', 'adoras', 'adores', 'a\xE7o~es', + 'log\xEDas', '\xEAncias', 'amente', + 'idades', 'ismos', 'istas', 'adora', + 'a\xE7a~o', 'antes', '\xE2ncia', + 'log\xEDa', 'uci\xF3n', '\xEAncia', + 'mente', 'idade', 'ezas', 'icos', 'icas', + 'ismo', '\xE1vel', '\xEDvel', 'ista', + 'osos', 'osas', 'ador', 'ante', 'ivas', + 'ivos', 'iras', 'eza', 'ico', 'ica', + 'oso', 'osa', 'iva', 'ivo', 'ira') + __step2_suffixes = ('ar\xEDamos', 'er\xEDamos', 'ir\xEDamos', + '\xE1ssemos', '\xEAssemos', '\xEDssemos', + 'ar\xEDeis', 'er\xEDeis', 'ir\xEDeis', + '\xE1sseis', '\xE9sseis', '\xEDsseis', + '\xE1ramos', '\xE9ramos', '\xEDramos', + '\xE1vamos', 'aremos', 'eremos', 'iremos', + 'ariam', 'eriam', 'iriam', 'assem', 'essem', + 'issem', 'ara~o', 'era~o', 'ira~o', 'arias', + 'erias', 'irias', 'ardes', 'erdes', 'irdes', + 'asses', 'esses', 'isses', 'astes', 'estes', + 'istes', '\xE1reis', 'areis', '\xE9reis', + 'ereis', '\xEDreis', 'ireis', '\xE1veis', + '\xEDamos', 'armos', 'ermos', 'irmos', + 'aria', 'eria', 'iria', 'asse', 'esse', + 'isse', 'aste', 'este', 'iste', 'arei', + 'erei', 'irei', 'aram', 'eram', 'iram', + 'avam', 'arem', 'erem', 'irem', + 'ando', 'endo', 'indo', 'adas', 'idas', + 'ar\xE1s', 'aras', 'er\xE1s', 'eras', + 'ir\xE1s', 'avas', 'ares', 'eres', 'ires', + '\xEDeis', 'ados', 'idos', '\xE1mos', + 'amos', 'emos', 'imos', 'iras', 'ada', 'ida', + 'ar\xE1', 'ara', 'er\xE1', 'era', + 'ir\xE1', 'ava', 'iam', 'ado', 'ido', + 'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am', + 'em', 'ar', 'er', 'ir', 'as', + 'es', 'is', 'eu', 'iu', 'ou') + __step4_suffixes = ("os", "a", "i", "o", "\xE1", + "\xED", "\xF3") + + def stem(self, word): + """ + Stem a Portuguese word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + step1_success = False + step2_success = False + + word = (word.replace("\xE3", "a~") + .replace("\xF5", "o~")) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic", "ad")): + word = word[:-2] + rv = rv[:-2] + + elif (suffix in ("ira", "iras") and rv.endswith(suffix) and + word[-len(suffix)-1:-len(suffix)] == "e"): + step1_success = True + + word = "".join((word[:-len(suffix)], "ir")) + rv = "".join((rv[:-len(suffix)], "ir")) + + elif r2.endswith(suffix): + step1_success = True + + if suffix in ("log\xEDa", "log\xEDas"): + word = word[:-2] + rv = rv[:-2] + + elif suffix in ("uci\xF3n", "uciones"): + word = "".join((word[:-len(suffix)], "u")) + rv = "".join((rv[:-len(suffix)], "u")) + + elif suffix in ("\xEAncia", "\xEAncias"): + word = "".join((word[:-len(suffix)], "ente")) + rv = "".join((rv[:-len(suffix)], "ente")) + + elif suffix == "mente": + word = word[:-5] + r2 = r2[:-5] + rv = rv[:-5] + + if r2.endswith(("ante", "avel", "\xEDvel")): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("idade", "idades"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + if r2.endswith(("ic", "iv")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith("abil"): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("iva", "ivo", "ivas", "ivos"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 2: Verb suffixes + if not step1_success: + for suffix in self.__step2_suffixes: + if rv.endswith(suffix): + step2_success = True + + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 3 + if step1_success or step2_success: + if rv.endswith("i") and word[-2] == "c": + word = word[:-1] + rv = rv[:-1] + + ### STEP 4: Residual suffix + if not step1_success and not step2_success: + for suffix in self.__step4_suffixes: + if rv.endswith(suffix): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 5 + if rv.endswith(("e", "\xE9", "\xEA")): + word = word[:-1] + rv = rv[:-1] + + if ((word.endswith("gu") and rv.endswith("u")) or + (word.endswith("ci") and rv.endswith("i"))): + word = word[:-1] + + elif word.endswith("\xE7"): + word = "".join((word[:-1], "c")) + + word = word.replace("a~", "\xE3").replace("o~", "\xF5") + + + return word + + + +class RomanianStemmer(_StandardStemmer): + + """ + The Romanian Snowball stemmer. + + :cvar __vowels: The Romanian vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
+ :type __step3_suffixes: tuple + :note: A detailed description of the Romanian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/romanian/stemmer.html + + """ + + __vowels = "aeiou\u0103\xE2\xEE" + __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor', + 'atei', 'a\u0163ie', 'a\u0163ia', 'aua', + 'ele', 'iua', 'iei', 'ile', 'ul', 'ea', + 'ii') + __step1_suffixes = ('abilitate', 'abilitati', 'abilit\u0103\u0163i', + 'ibilitate', 'abilit\u0103i', 'ivitate', + 'ivitati', 'ivit\u0103\u0163i', 'icitate', + 'icitati', 'icit\u0103\u0163i', 'icatori', + 'ivit\u0103i', 'icit\u0103i', 'icator', + 'a\u0163iune', 'atoare', '\u0103toare', + 'i\u0163iune', 'itoare', 'iciva', 'icive', + 'icivi', 'iciv\u0103', 'icala', 'icale', + 'icali', 'ical\u0103', 'ativa', 'ative', + 'ativi', 'ativ\u0103', 'atori', '\u0103tori', + 'itiva', 'itive', 'itivi', 'itiv\u0103', + 'itori', 'iciv', 'ical', 'ativ', 'ator', + '\u0103tor', 'itiv', 'itor') + __step2_suffixes = ('abila', 'abile', 'abili', 'abil\u0103', + 'ibila', 'ibile', 'ibili', 'ibil\u0103', + 'atori', 'itate', 'itati', 'it\u0103\u0163i', + 'abil', 'ibil', 'oasa', 'oas\u0103', 'oase', + 'anta', 'ante', 'anti', 'ant\u0103', 'ator', + 'it\u0103i', 'iune', 'iuni', 'isme', 'ista', + 'iste', 'isti', 'ist\u0103', 'i\u015Fti', + 'ata', 'at\u0103', 'ati', 'ate', 'uta', + 'ut\u0103', 'uti', 'ute', 'ita', 'it\u0103', + 'iti', 'ite', 'ica', 'ice', 'ici', 'ic\u0103', + 'osi', 'o\u015Fi', 'ant', 'iva', 'ive', 'ivi', + 'iv\u0103', 'ism', 'ist', 'at', 'ut', 'it', + 'ic', 'os', 'iv') + __step3_suffixes = ('seser\u0103\u0163i', 'aser\u0103\u0163i', + 'iser\u0103\u0163i', '\xE2ser\u0103\u0163i', + 'user\u0103\u0163i', 'seser\u0103m', + 'aser\u0103m', 'iser\u0103m', '\xE2ser\u0103m', + 'user\u0103m', 'ser\u0103\u0163i', 'sese\u015Fi', + 'seser\u0103', 'easc\u0103', 'ar\u0103\u0163i', + 'ur\u0103\u0163i', 'ir\u0103\u0163i', + '\xE2r\u0103\u0163i', 'ase\u015Fi', + 'aser\u0103', 'ise\u015Fi', 'iser\u0103', + '\xe2se\u015Fi', '\xE2ser\u0103', + 'use\u015Fi', 'user\u0103', 'ser\u0103m', + 'sesem', 'indu', '\xE2ndu', 'eaz\u0103', + 'e\u015Fti', 'e\u015Fte', '\u0103\u015Fti', + '\u0103\u015Fte', 'ea\u0163i', 'ia\u0163i', + 'ar\u0103m', 'ur\u0103m', 'ir\u0103m', + '\xE2r\u0103m', 'asem', 'isem', + '\xE2sem', 'usem', 'se\u015Fi', 'ser\u0103', + 'sese', 'are', 'ere', 'ire', '\xE2re', + 'ind', '\xE2nd', 'eze', 'ezi', 'esc', + '\u0103sc', 'eam', 'eai', 'eau', 'iam', + 'iai', 'iau', 'a\u015Fi', 'ar\u0103', + 'u\u015Fi', 'ur\u0103', 'i\u015Fi', 'ir\u0103', + '\xE2\u015Fi', '\xe2r\u0103', 'ase', + 'ise', '\xE2se', 'use', 'a\u0163i', + 'e\u0163i', 'i\u0163i', '\xe2\u0163i', 'sei', + 'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui', + '\xE2i', '\u0103m', 'em', 'im', '\xE2m', + 'se') + + def stem(self, word): + """ + Stem a Romanian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + step1_success = False + step2_success = False + + for i in range(1, len(word)-1): + if word[i-1] in self.__vowels and word[i+1] in self.__vowels: + if word[i] == "u": + word = "".join((word[:i], "U", word[i+1:])) + + elif word[i] == "i": + word = "".join((word[:i], "I", word[i+1:])) + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Removal of plurals and other simplifications + for suffix in self.__step0_suffixes: + if word.endswith(suffix): + if suffix in r1: + if suffix in ("ul", "ului"): + word = word[:-len(suffix)] + + if suffix in rv: + rv = rv[:-len(suffix)] + else: + rv = "" + + elif (suffix == "aua" or suffix == "atei" or + (suffix == "ile" and word[-5:-3] != "ab")): + word = word[:-2] + + elif suffix in ("ea", "ele", "elor"): + word = "".join((word[:-len(suffix)], "e")) + + if suffix in rv: + rv = "".join((rv[:-len(suffix)], "e")) + else: + rv = "" + + elif suffix in ("ii", "iua", "iei", + "iile", "iilor", "ilor"): + word = "".join((word[:-len(suffix)], "i")) + + if suffix in rv: + rv = "".join((rv[:-len(suffix)], "i")) + else: + rv = "" + + elif suffix in ("a\u0163ie", "a\u0163ia"): + word = word[:-1] + break + + # STEP 1: Reduction of combining suffixes + while True: + + replacement_done = False + + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix in r1: + step1_success = True + replacement_done = True + + if suffix in ("abilitate", "abilitati", + "abilit\u0103i", + "abilit\u0103\u0163i"): + word = "".join((word[:-len(suffix)], "abil")) + + elif suffix == "ibilitate": + word = word[:-5] + + elif suffix in ("ivitate", "ivitati", + "ivit\u0103i", + "ivit\u0103\u0163i"): + word = "".join((word[:-len(suffix)], "iv")) + + elif suffix in ("icitate", "icitati", "icit\u0103i", + "icit\u0103\u0163i", "icator", + "icatori", "iciv", "iciva", + "icive", "icivi", "iciv\u0103", + "ical", "icala", "icale", "icali", + "ical\u0103"): + word = "".join((word[:-len(suffix)], "ic")) + + elif suffix in ("ativ", "ativa", "ative", "ativi", + "ativ\u0103", "a\u0163iune", + "atoare", "ator", "atori", + "\u0103toare", + "\u0103tor", "\u0103tori"): + word = "".join((word[:-len(suffix)], "at")) + + if suffix in r2: + r2 = "".join((r2[:-len(suffix)], "at")) + + elif suffix in ("itiv", "itiva", "itive", "itivi", + "itiv\u0103", "i\u0163iune", + "itoare", "itor", "itori"): + word = "".join((word[:-len(suffix)], "it")) + + if suffix in r2: + r2 = "".join((r2[:-len(suffix)], "it")) + else: + step1_success = False + break + + if not replacement_done: + break + + # STEP 2: Removal of standard suffixes + for suffix in self.__step2_suffixes: + if word.endswith(suffix): + if suffix in r2: + step2_success = True + + if suffix in ("iune", "iuni"): + if word[-5] == "\u0163": + word = "".join((word[:-5], "t")) + + elif suffix in ("ism", "isme", "ist", "ista", "iste", + "isti", "ist\u0103", "i\u015Fti"): + word = "".join((word[:-len(suffix)], "ist")) + + else: + word = word[:-len(suffix)] + break + + # STEP 3: Removal of verb suffixes + if not step1_success and not step2_success: + for suffix in self.__step3_suffixes: + if word.endswith(suffix): + if suffix in rv: + if suffix in ('seser\u0103\u0163i', 'seser\u0103m', + 'ser\u0103\u0163i', 'sese\u015Fi', + 'seser\u0103', 'ser\u0103m', 'sesem', + 'se\u015Fi', 'ser\u0103', 'sese', + 'a\u0163i', 'e\u0163i', 'i\u0163i', + '\xE2\u0163i', 'sei', '\u0103m', + 'em', 'im', '\xE2m', 'se'): + word = word[:-len(suffix)] + rv = 
rv[:-len(suffix)] + else: + if (not rv.startswith(suffix) and + rv[rv.index(suffix)-1] not in + "aeio\u0103\xE2\xEE"): + word = word[:-len(suffix)] + break + + # STEP 4: Removal of final vowel + for suffix in ("ie", "a", "e", "i", "\u0103"): + if word.endswith(suffix): + if suffix in rv: + word = word[:-len(suffix)] + break + + word = word.replace("I", "i").replace("U", "u") + + + return word + + + +class RussianStemmer(_LanguageSpecificStemmer): + + """ + The Russian Snowball stemmer. + + :cvar __perfective_gerund_suffixes: Suffixes to be deleted. + :type __perfective_gerund_suffixes: tuple + :cvar __adjectival_suffixes: Suffixes to be deleted. + :type __adjectival_suffixes: tuple + :cvar __reflexive_suffixes: Suffixes to be deleted. + :type __reflexive_suffixes: tuple + :cvar __verb_suffixes: Suffixes to be deleted. + :type __verb_suffixes: tuple + :cvar __noun_suffixes: Suffixes to be deleted. + :type __noun_suffixes: tuple + :cvar __superlative_suffixes: Suffixes to be deleted. + :type __superlative_suffixes: tuple + :cvar __derivational_suffixes: Suffixes to be deleted. + :type __derivational_suffixes: tuple + :note: A detailed description of the Russian + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/russian/stemmer.html + + """ + + __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'", + "ivshi", "yvshi", "vshi", "iv", + "yv", "v") + __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a', + 'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego', + 'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu', + 'ui^ushchikh', 'ui^ushchykh', + 'ui^ushchui^u', 'ui^ushchaia', + 'ui^ushchoi^u', 'ui^ushchei^u', + 'i^ushchi^ui^u', 'i^ushchi^ai^a', + 'ui^ushchee', 'ui^ushchie', + 'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`', + 'ui^ushchii`', 'ui^ushchyi`', + 'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim', + 'ui^ushchym', 'ui^ushchom', 'i^ushchimi', + 'i^ushchymi', 'i^ushchego', 'i^ushchogo', + 'i^ushchemu', 'i^ushchomu', 'i^ushchikh', + 'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a', + 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee', + 'i^ushchie', 'i^ushchye', 'i^ushchoe', + 'i^ushchei`', 'i^ushchii`', + 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem', + 'i^ushchim', 'i^ushchym', 'i^ushchom', + 'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u', + 'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a', + 'shchimi', 'shchymi', 'shchego', 'shchogo', + 'shchemu', 'shchomu', 'shchikh', 'shchykh', + 'shchui^u', 'shchai^a', 'shchoi^u', + 'shchei^u', 'ivshimi', 'ivshymi', + 'ivshego', 'ivshogo', 'ivshemu', 'ivshomu', + 'ivshikh', 'ivshykh', 'ivshui^u', + 'ivshai^a', 'ivshoi^u', 'ivshei^u', + 'yvshimi', 'yvshymi', 'yvshego', 'yvshogo', + 'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh', + 'yvshui^u', 'yvshai^a', 'yvshoi^u', + 'yvshei^u', 'vshi^ui^u', 'vshi^ai^a', + 'shchee', 'shchie', 'shchye', 'shchoe', + 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', + 'shchem', 'shchim', 'shchym', 'shchom', + 'ivshee', 'ivshie', 'ivshye', 'ivshoe', + 'ivshei`', 'ivshii`', 'ivshyi`', + 'ivshoi`', 'ivshem', 'ivshim', 'ivshym', + 'ivshom', 'yvshee', 'yvshie', 'yvshye', + 'yvshoe', 'yvshei`', 'yvshii`', + 'yvshyi`', 'yvshoi`', 'yvshem', + 'yvshim', 'yvshym', 'yvshom', 'vshimi', + 'vshymi', 'vshego', 'vshogo', 'vshemu', + 'vshomu', 'vshikh', 'vshykh', 'vshui^u', + 'vshai^a', 'vshoi^u', 'vshei^u', + 'emi^ui^u', 'emi^ai^a', 'nni^ui^u', + 'nni^ai^a', 'vshee', + 'vshie', 'vshye', 'vshoe', 'vshei`', + 'vshii`', 'vshyi`', 'vshoi`', + 'vshem', 'vshim', 'vshym', 'vshom', + 'emimi', 'emymi', 'emego', 'emogo', + 'ememu', 'emomu', 'emikh', 'emykh', + 'emui^u', 
'emai^a', 'emoi^u', 'emei^u', + 'nnimi', 'nnymi', 'nnego', 'nnogo', + 'nnemu', 'nnomu', 'nnikh', 'nnykh', + 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', + 'emee', 'emie', 'emye', 'emoe', + 'emei`', 'emii`', 'emyi`', + 'emoi`', 'emem', 'emim', 'emym', + 'emom', 'nnee', 'nnie', 'nnye', 'nnoe', + 'nnei`', 'nnii`', 'nnyi`', + 'nnoi`', 'nnem', 'nnim', 'nnym', + 'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi', + 'ego', 'ogo', 'emu', 'omu', 'ikh', + 'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u', + 'ee', 'ie', 'ye', 'oe', 'ei`', + 'ii`', 'yi`', 'oi`', 'em', + 'im', 'ym', 'om') + __reflexive_suffixes = ("si^a", "s'") + __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut', + "ish'", 'ete', 'i`te', 'i^ut', 'nno', + 'ila', 'yla', 'ena', 'ite', 'ili', 'yli', + 'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny', + "it'", "yt'", 'ui^u', 'la', 'na', 'li', + 'em', 'lo', 'no', 'et', 'ny', "t'", + 'ei`', 'ui`', 'il', 'yl', 'im', + 'ym', 'en', 'it', 'yt', 'i^u', 'i`', + 'l', 'n') + __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh', + 'ami', 'iei`', 'i^am', 'iem', 'akh', + 'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov', + 'ie', "'e", 'ei', 'ii', 'ei`', + 'oi`', 'ii`', 'em', 'am', 'om', + 'i^u', 'i^a', 'a', 'e', 'i', 'i`', + 'o', 'u', 'y', "'") + __superlative_suffixes = ("ei`she", "ei`sh") + __derivational_suffixes = ("ost'", "ost") + + def stem(self, word): + """ + Stem a Russian word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + chr_exceeded = False + for i in range(len(word)): + if ord(word[i]) > 255: + chr_exceeded = True + break + + if chr_exceeded: + word = self.__cyrillic_to_roman(word) + + step1_success = False + adjectival_removed = False + verb_removed = False + undouble_success = False + superlative_removed = False + + rv, r2 = self.__regions_russian(word) + + # Step 1 + for suffix in self.__perfective_gerund_suffixes: + if rv.endswith(suffix): + if suffix in ("v", "vshi", "vshis'"): + if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or + rv[-len(suffix)-1:-len(suffix)] == "a"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + step1_success = True + break + else: + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + step1_success = True + break + + if not step1_success: + for suffix in self.__reflexive_suffixes: + if rv.endswith(suffix): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + for suffix in self.__adjectival_suffixes: + if rv.endswith(suffix): + if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a', + 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u', + 'i^ushchei^u', 'i^ushchimi', 'i^ushchymi', + 'i^ushchego', 'i^ushchogo', 'i^ushchemu', + 'i^ushchomu', 'i^ushchikh', 'i^ushchykh', + 'shchi^ui^u', 'shchi^ai^a', 'i^ushchee', + 'i^ushchie', 'i^ushchye', 'i^ushchoe', + 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`', + 'i^ushchoi`', 'i^ushchem', 'i^ushchim', + 'i^ushchym', 'i^ushchom', 'vshi^ui^u', + 'vshi^ai^a', 'shchui^u', 'shchai^a', + 'shchoi^u', 'shchei^u', 'emi^ui^u', + 'emi^ai^a', 'nni^ui^u', 'nni^ai^a', + 'shchimi', 'shchymi', 'shchego', 'shchogo', + 'shchemu', 'shchomu', 'shchikh', 'shchykh', + 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u', + 'shchee', 'shchie', 'shchye', 'shchoe', + 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', + 'shchem', 'shchim', 'shchym', 'shchom', + 'vshimi', 'vshymi', 'vshego', 'vshogo', + 'vshemu', 'vshomu', 'vshikh', 'vshykh', + 'emui^u', 'emai^a', 'emoi^u', 'emei^u', + 'nnui^u', 'nnai^a', 
'nnoi^u', 'nnei^u', + 'vshee', 'vshie', 'vshye', 'vshoe', + 'vshei`', 'vshii`', 'vshyi`', 'vshoi`', + 'vshem', 'vshim', 'vshym', 'vshom', + 'emimi', 'emymi', 'emego', 'emogo', + 'ememu', 'emomu', 'emikh', 'emykh', + 'nnimi', 'nnymi', 'nnego', 'nnogo', + 'nnemu', 'nnomu', 'nnikh', 'nnykh', + 'emee', 'emie', 'emye', 'emoe', 'emei`', + 'emii`', 'emyi`', 'emoi`', 'emem', 'emim', + 'emym', 'emom', 'nnee', 'nnie', 'nnye', + 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`', + 'nnem', 'nnim', 'nnym', 'nnom'): + if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or + rv[-len(suffix)-1:-len(suffix)] == "a"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + adjectival_removed = True + break + else: + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + adjectival_removed = True + break + + if not adjectival_removed: + for suffix in self.__verb_suffixes: + if rv.endswith(suffix): + if suffix in ("la", "na", "ete", "i`te", "li", + "i`", "l", "em", "n", "lo", "no", + "et", "i^ut", "ny", "t'", "esh'", + "nno"): + if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or + rv[-len(suffix)-1:-len(suffix)] == "a"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + verb_removed = True + break + else: + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + verb_removed = True + break + + if not adjectival_removed and not verb_removed: + for suffix in self.__noun_suffixes: + if rv.endswith(suffix): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # Step 2 + if rv.endswith("i"): + word = word[:-1] + r2 = r2[:-1] + + # Step 3 + for suffix in self.__derivational_suffixes: + if r2.endswith(suffix): + word = word[:-len(suffix)] + break + + # Step 4 + if word.endswith("nn"): + word = word[:-1] + undouble_success = True + + if not undouble_success: + for suffix in self.__superlative_suffixes: + if word.endswith(suffix): + word = word[:-len(suffix)] + superlative_removed = True + break + if word.endswith("nn"): + word = word[:-1] + + if not undouble_success and not superlative_removed: + if word.endswith("'"): + word = word[:-1] + + if chr_exceeded: + word = self.__roman_to_cyrillic(word) + + + return word + + + + def __regions_russian(self, word): + """ + Return the regions RV and R2 which are used by the Russian stemmer. + + In any word, RV is the region after the first vowel, + or the end of the word if it contains no vowel. + + R2 is the region after the first non-vowel following + a vowel in R1, or the end of the word if there is no such non-vowel. + + R1 is the region after the first non-vowel following a vowel, + or the end of the word if there is no such non-vowel. + + :param word: The Russian word whose regions RV and R2 are determined. + :type word: str or unicode + :return: the regions RV and R2 for the respective Russian word. + :rtype: tuple + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! 
+ + """ + r1 = "" + r2 = "" + rv = "" + + vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y") + word = (word.replace("i^a", "A") + .replace("i^u", "U") + .replace("e`", "E")) + + for i in range(1, len(word)): + if word[i] not in vowels and word[i-1] in vowels: + r1 = word[i+1:] + break + + for i in range(1, len(r1)): + if r1[i] not in vowels and r1[i-1] in vowels: + r2 = r1[i+1:] + break + + for i in range(len(word)): + if word[i] in vowels: + rv = word[i+1:] + break + + r2 = (r2.replace("A", "i^a") + .replace("U", "i^u") + .replace("E", "e`")) + rv = (rv.replace("A", "i^a") + .replace("U", "i^u") + .replace("E", "e`")) + + + return (rv, r2) + + + + def __cyrillic_to_roman(self, word): + """ + Transliterate a Russian word into the Roman alphabet. + + A Russian word whose letters consist of the Cyrillic + alphabet are transliterated into the Roman alphabet + in order to ease the forthcoming stemming process. + + :param word: The word that is transliterated. + :type word: unicode + :return: the transliterated word. + :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! + + """ + word = (word.replace("\u0410", "a").replace("\u0430", "a") + .replace("\u0411", "b").replace("\u0431", "b") + .replace("\u0412", "v").replace("\u0432", "v") + .replace("\u0413", "g").replace("\u0433", "g") + .replace("\u0414", "d").replace("\u0434", "d") + .replace("\u0415", "e").replace("\u0435", "e") + .replace("\u0401", "e").replace("\u0451", "e") + .replace("\u0416", "zh").replace("\u0436", "zh") + .replace("\u0417", "z").replace("\u0437", "z") + .replace("\u0418", "i").replace("\u0438", "i") + .replace("\u0419", "i`").replace("\u0439", "i`") + .replace("\u041A", "k").replace("\u043A", "k") + .replace("\u041B", "l").replace("\u043B", "l") + .replace("\u041C", "m").replace("\u043C", "m") + .replace("\u041D", "n").replace("\u043D", "n") + .replace("\u041E", "o").replace("\u043E", "o") + .replace("\u041F", "p").replace("\u043F", "p") + .replace("\u0420", "r").replace("\u0440", "r") + .replace("\u0421", "s").replace("\u0441", "s") + .replace("\u0422", "t").replace("\u0442", "t") + .replace("\u0423", "u").replace("\u0443", "u") + .replace("\u0424", "f").replace("\u0444", "f") + .replace("\u0425", "kh").replace("\u0445", "kh") + .replace("\u0426", "t^s").replace("\u0446", "t^s") + .replace("\u0427", "ch").replace("\u0447", "ch") + .replace("\u0428", "sh").replace("\u0448", "sh") + .replace("\u0429", "shch").replace("\u0449", "shch") + .replace("\u042A", "''").replace("\u044A", "''") + .replace("\u042B", "y").replace("\u044B", "y") + .replace("\u042C", "'").replace("\u044C", "'") + .replace("\u042D", "e`").replace("\u044D", "e`") + .replace("\u042E", "i^u").replace("\u044E", "i^u") + .replace("\u042F", "i^a").replace("\u044F", "i^a")) + + + return word + + + + def __roman_to_cyrillic(self, word): + """ + Transliterate a Russian word back into the Cyrillic alphabet. + + A Russian word formerly transliterated into the Roman alphabet + in order to ease the stemming process, is transliterated back + into the Cyrillic alphabet, its original form. + + :param word: The word that is transliterated. + :type word: str or unicode + :return: word, the transliterated word. + :rtype: unicode + :note: This helper method is invoked by the stem method of the subclass + RussianStemmer. It is not to be invoked directly! 
+ + """ + word = (word.replace("i^u", "\u044E").replace("i^a", "\u044F") + .replace("shch", "\u0449").replace("kh", "\u0445") + .replace("t^s", "\u0446").replace("ch", "\u0447") + .replace("e`", "\u044D").replace("i`", "\u0439") + .replace("sh", "\u0448").replace("k", "\u043A") + .replace("e", "\u0435").replace("zh", "\u0436") + .replace("a", "\u0430").replace("b", "\u0431") + .replace("v", "\u0432").replace("g", "\u0433") + .replace("d", "\u0434").replace("e", "\u0435") + .replace("z", "\u0437").replace("i", "\u0438") + .replace("l", "\u043B").replace("m", "\u043C") + .replace("n", "\u043D").replace("o", "\u043E") + .replace("p", "\u043F").replace("r", "\u0440") + .replace("s", "\u0441").replace("t", "\u0442") + .replace("u", "\u0443").replace("f", "\u0444") + .replace("''", "\u044A").replace("y", "\u044B") + .replace("'", "\u044C")) + + + return word + + + +class SpanishStemmer(_StandardStemmer): + + """ + The Spanish Snowball stemmer. + + :cvar __vowels: The Spanish vowels. + :type __vowels: unicode + :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. + :type __step0_suffixes: tuple + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. + :type __step2a_suffixes: tuple + :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. + :type __step2b_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Spanish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/spanish/stemmer.html + + """ + + __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC" + __step0_suffixes = ("selas", "selos", "sela", "selo", "las", + "les", "los", "nos", "me", "se", "la", "le", + "lo") + __step1_suffixes = ('amientos', 'imientos', 'amiento', 'imiento', + 'aciones', 'uciones', 'adoras', 'adores', + 'ancias', 'log\xEDas', 'encias', 'amente', + 'idades', 'anzas', 'ismos', 'ables', 'ibles', + 'istas', 'adora', 'aci\xF3n', 'antes', + 'ancia', 'log\xEDa', 'uci\xf3n', 'encia', + 'mente', 'anza', 'icos', 'icas', 'ismo', + 'able', 'ible', 'ista', 'osos', 'osas', + 'ador', 'ante', 'idad', 'ivas', 'ivos', + 'ico', + 'ica', 'oso', 'osa', 'iva', 'ivo') + __step2a_suffixes = ('yeron', 'yendo', 'yamos', 'yais', 'yan', + 'yen', 'yas', 'yes', 'ya', 'ye', 'yo', + 'y\xF3') + __step2b_suffixes = ('ar\xEDamos', 'er\xEDamos', 'ir\xEDamos', + 'i\xE9ramos', 'i\xE9semos', 'ar\xEDais', + 'aremos', 'er\xEDais', 'eremos', + 'ir\xEDais', 'iremos', 'ierais', 'ieseis', + 'asteis', 'isteis', '\xE1bamos', + '\xE1ramos', '\xE1semos', 'ar\xEDan', + 'ar\xEDas', 'ar\xE9is', 'er\xEDan', + 'er\xEDas', 'er\xE9is', 'ir\xEDan', + 'ir\xEDas', 'ir\xE9is', + 'ieran', 'iesen', 'ieron', 'iendo', 'ieras', + 'ieses', 'abais', 'arais', 'aseis', + '\xE9amos', 'ar\xE1n', 'ar\xE1s', + 'ar\xEDa', 'er\xE1n', 'er\xE1s', + 'er\xEDa', 'ir\xE1n', 'ir\xE1s', + 'ir\xEDa', 'iera', 'iese', 'aste', 'iste', + 'aban', 'aran', 'asen', 'aron', 'ando', + 'abas', 'adas', 'idas', 'aras', 'ases', + '\xEDais', 'ados', 'idos', 'amos', 'imos', + 'emos', 'ar\xE1', 'ar\xE9', 'er\xE1', + 'er\xE9', 'ir\xE1', 'ir\xE9', 'aba', + 'ada', 'ida', 'ara', 'ase', '\xEDan', + 'ado', 'ido', '\xEDas', '\xE1is', + '\xE9is', '\xEDa', 'ad', 'ed', 'id', + 'an', 'i\xF3', 'ar', 'er', 'ir', 'as', + '\xEDs', 'en', 'es') + __step3_suffixes = ("os", "a", "e", "o", "\xE1", + "\xE9", "\xED", "\xF3") + + def 
stem(self, word): + """ + Stem a Spanish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. + :rtype: unicode + + """ + word = word.lower() + + step1_success = False + + r1, r2 = self._r1r2_standard(word, self.__vowels) + rv = self._rv_standard(word, self.__vowels) + + # STEP 0: Attached pronoun + for suffix in self.__step0_suffixes: + if word.endswith(suffix): + if rv.endswith(suffix): + if rv[:-len(suffix)].endswith(("i\xE9ndo", + "\xE1ndo", + "\xE1r", "\xE9r", + "\xEDr")): + word = (word[:-len(suffix)].replace("\xE1", "a") + .replace("\xE9", "e") + .replace("\xED", "i")) + r1 = (r1[:-len(suffix)].replace("\xE1", "a") + .replace("\xE9", "e") + .replace("\xED", "i")) + r2 = (r2[:-len(suffix)].replace("\xE1", "a") + .replace("\xE9", "e") + .replace("\xED", "i")) + rv = (rv[:-len(suffix)].replace("\xE1", "a") + .replace("\xE9", "e") + .replace("\xED", "i")) + + elif rv[:-len(suffix)].endswith(("ando", "iendo", + "ar", "er", "ir")): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + elif (rv[:-len(suffix)].endswith("yendo") and + word[:-len(suffix)].endswith("uyendo")): + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 1: Standard suffix removal + for suffix in self.__step1_suffixes: + if word.endswith(suffix): + if suffix == "amente" and r1.endswith(suffix): + step1_success = True + word = word[:-6] + r2 = r2[:-6] + rv = rv[:-6] + + if r2.endswith("iv"): + word = word[:-2] + r2 = r2[:-2] + rv = rv[:-2] + + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(("os", "ic", "ad")): + word = word[:-2] + rv = rv[:-2] + + elif r2.endswith(suffix): + step1_success = True + if suffix in ("adora", "ador", "aci\xF3n", "adoras", + "adores", "aciones", "ante", "antes", + "ancia", "ancias"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + if r2.endswith("ic"): + word = word[:-2] + rv = rv[:-2] + + elif suffix in ("log\xEDa", "log\xEDas"): + word = word.replace(suffix, "log") + rv = rv.replace(suffix, "log") + + elif suffix in ("uci\xF3n", "uciones"): + word = word.replace(suffix, "u") + rv = rv.replace(suffix, "u") + + elif suffix in ("encia", "encias"): + word = word.replace(suffix, "ente") + rv = rv.replace(suffix, "ente") + + elif suffix == "mente": + word = word[:-5] + r2 = r2[:-5] + rv = rv[:-5] + + if r2.endswith(("ante", "able", "ible")): + word = word[:-4] + rv = rv[:-4] + + elif suffix in ("idad", "idades"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + + for pre_suff in ("abil", "ic", "iv"): + if r2.endswith(pre_suff): + word = word[:-len(pre_suff)] + rv = rv[:-len(pre_suff)] + + elif suffix in ("ivo", "iva", "ivos", "ivas"): + word = word[:-len(suffix)] + r2 = r2[:-len(suffix)] + rv = rv[:-len(suffix)] + if r2.endswith("at"): + word = word[:-2] + rv = rv[:-2] + else: + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 2a: Verb suffixes beginning 'y' + if not step1_success: + for suffix in self.__step2a_suffixes: + if (rv.endswith(suffix) and + word[-len(suffix)-1:-len(suffix)] == "u"): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 2b: Other verb suffixes + for suffix in self.__step2b_suffixes: + if rv.endswith(suffix): + if suffix in ("en", "es", "\xE9is", "emos"): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + + if 
word.endswith("gu"): + word = word[:-1] + + if rv.endswith("gu"): + rv = rv[:-1] + else: + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + break + + # STEP 3: Residual suffix + for suffix in self.__step3_suffixes: + if rv.endswith(suffix): + if suffix in ("e", "\xE9"): + word = word[:-len(suffix)] + rv = rv[:-len(suffix)] + + if word[-2:] == "gu" and rv[-1] == "u": + word = word[:-1] + else: + word = word[:-len(suffix)] + break + + word = (word.replace("\xE1", "a").replace("\xE9", "e") + .replace("\xED", "i").replace("\xF3", "o") + .replace("\xFA", "u")) + + + return word + + + +class SwedishStemmer(_ScandinavianStemmer): + + """ + The Swedish Snowball stemmer. + + :cvar __vowels: The Swedish vowels. + :type __vowels: unicode + :cvar __s_ending: Letters that may directly appear before a word final 's'. + :type __s_ending: unicode + :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. + :type __step1_suffixes: tuple + :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. + :type __step2_suffixes: tuple + :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. + :type __step3_suffixes: tuple + :note: A detailed description of the Swedish + stemming algorithm can be found under + http://snowball.tartarus.org/algorithms/swedish/stemmer.html + + """ + + __vowels = "aeiouy\xE4\xE5\xF6" + __s_ending = "bcdfghjklmnoprtvy" + __step1_suffixes = ("heterna", "hetens", "heter", "heten", + "anden", "arnas", "ernas", "ornas", "andes", + "andet", "arens", "arna", "erna", "orna", + "ande", "arne", "aste", "aren", "ades", + "erns", "ade", "are", "ern", "ens", "het", + "ast", "ad", "en", "ar", "er", "or", "as", + "es", "at", "a", "e", "s") + __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") + __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig") + + def stem(self, word): + """ + Stem a Swedish word and return the stemmed form. + + :param word: The word that is stemmed. + :type word: str or unicode + :return: The stemmed form. 
+ :rtype: unicode + + """ + word = word.lower() + + r1 = self._r1_scandinavian(word, self.__vowels) + + # STEP 1 + for suffix in self.__step1_suffixes: + if r1.endswith(suffix): + if suffix == "s": + if word[-2] in self.__s_ending: + word = word[:-1] + r1 = r1[:-1] + else: + word = word[:-len(suffix)] + r1 = r1[:-len(suffix)] + break + + # STEP 2 + for suffix in self.__step2_suffixes: + if r1.endswith(suffix): + word = word[:-1] + r1 = r1[:-1] + break + + # STEP 3 + for suffix in self.__step3_suffixes: + if r1.endswith(suffix): + if suffix in ("els", "lig", "ig"): + word = word[:-len(suffix)] + elif suffix in ("fullt", "l\xF6st"): + word = word[:-1] + break + + + return word + + +class PolishStemmer(_LanguageSpecificStemmer): + """ + The Polish stemmer, implemented based on python stemmer + for Polish language available at: https://github.com/Tutanchamon/pl_stemmer + """ + + def stem(self, word): + word = word.lower() + + stem = word[:] + stem = self.remove_nouns(stem) + stem = self.remove_diminutive(stem) + stem = self.remove_adjective_ends(stem) + stem = self.remove_verbs_ends(stem) + stem = self.remove_adverbs_ends(stem) + stem = self.remove_plural_forms(stem) + stem = self.remove_general_ends(stem) + + return stem + + @staticmethod + def remove_general_ends(word): + # print "DEBUG: END", word[-1:] + if len(word) > 4 and word[-2:] in {"ia", "ie"}: + return word[:-2] + if len(word) > 4 and word[-1:] in {"u", u"ą", "i", "a", u"ę", "y", u"ę", u"ł"}: + return word[:-1] + return word + + @staticmethod + def remove_diminutive(word): + if len(word) > 6: + if word[-5:] in {"eczek", "iczek", "iszek", "aszek", "uszek"}: + return word[:-5] + if word[-4:] in {"enek", "ejek", "erek"}: + return word[:-2] + if len(word) > 4: + if word[-2:] in {"ek", "ak"}: + return word[:-2] + return word + + @staticmethod + def remove_verbs_ends(word): + if len(word) > 5 and word.endswith("bym"): + return word[:-3] + if len(word) > 5 and word[-3:] in {"esz", "asz", "cie", u"eść", u"aść", u"łem", "amy", "emy"}: + return word[:-3] + if len(word) > 3 and word[-3:] in {"esz", "asz", u"eść", u"aść", u"eć", u"ać"}: + return word[:-2] + if len(word) > 3 and word[-3:] in {"aj"}: + return word[:-1] + if len(word) > 3 and word[-2:] in {u"ać", "em", "am", u"ał", u"ił", u"ić", u"ąc"}: + return word[:-2] + return word + + @staticmethod + def remove_nouns(word): + if len(word) > 7 and word[-5:] in {"zacja", u"zacją", "zacji"}: + return word[:-4] + if len(word) > 6 and word[-4:] in {"acja", "acji", u"acją", "tach", "anie", "enie", + "eniu", "aniu"}: + return word[:-4] + if len(word) > 6 and word.endswith("tyka"): + return word[:-2] + if len(word) > 5 and word[-3:] in {"ach", "ami", "nia", "niu", "cia", "ciu"}: + return word[:-3] + if len(word) > 5 and word[-3:] in {"cji", "cja", u"cją"}: + return word[:-2] + if len(word) > 5 and word[-2:] in {"ce", "ta"}: + return word[:-2] + return word + + @staticmethod + def remove_adjective_ends(word): + if len(word) > 7 and word.startswith("naj") and (word.endswith("sze") + or word.endswith("szy")): + return word[3:-3] + if len(word) > 7 and word.startswith("naj") and word.endswith("szych"): + return word[3:-5] + if len(word) > 6 and word.endswith("czny"): + return word[:-4] + if len(word) > 5 and word[-3:] in {"owy", "owa", "owe", "ych", "ego"}: + return word[:-3] + if len(word) > 5 and word[-2:] in {"ej"}: + return word[:-2] + return word + + @staticmethod + def remove_adverbs_ends(word): + if len(word) > 4 and word[:-3] in {"nie", "wie"}: + return word[:-2] + if len(word) > 4 and 
word.endswith("rze"): + return word[:-2] + return word + + @staticmethod + def remove_plural_forms(word): + if len(word) > 4 and (word.endswith(u"ów") or word.endswith("om")): + return word[:-2] + if len(word) > 4 and word.endswith("ami"): + return word[:-3] + return word + +class ArabicStemmer(_StandardStemmer, _LanguageSpecificStemmer): + + # Normalize_pre stes + __vocalization = re.compile( + r'[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]' + ) + + __kasheeda = re.compile(r'[\u0640]') # ـ tatweel/kasheeda + + __arabic_punctuation_marks = re.compile(r'[\u060C-\u061B-\u061F]') # ؛ ، ؟ + + # Normalize_post + __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626') # أ، إ، آ، ؤ، ئ + + # normalize other hamza's + __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]') # أ، إ، آ + + __waw_hamza = re.compile(r'[\u0624]') # ؤ + + __yeh_hamza = re.compile(r'[\u0626]') # ئ + + __alefat = re.compile(r'[\u0623\u0622\u0625]') # أ، إ، آ + + # Checks + __checks1 = ( + '\u0643\u0627\u0644', + '\u0628\u0627\u0644', # بال، كال + '\u0627\u0644', + '\u0644\u0644', # لل، ال + ) + + __checks2 = ('\u0629', '\u0627\u062a') # ة # female plural ات + + # Suffixes + __suffix_noun_step1a = ( + '\u064a', + '\u0643', + '\u0647', # ي، ك، ه + '\u0646\u0627', + '\u0643\u0645', + '\u0647\u0627', + '\u0647\u0646', + '\u0647\u0645', # نا، كم، ها، هن، هم + '\u0643\u0645\u0627', + '\u0647\u0645\u0627', # كما، هما + ) + + __suffix_noun_step1b = '\u0646' # ن + + __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648') # ا، ي، و + + __suffix_noun_step2b = '\u0627\u062a' # ات + + __suffix_noun_step2c1 = '\u062a' # ت + + __suffix_noun_step2c2 = '\u0629' # ة + + __suffix_noun_step3 = '\u064a' # ي + + __suffix_verb_step1 = ( + '\u0647', + '\u0643', # ه، ك + '\u0646\u064a', + '\u0646\u0627', + '\u0647\u0627', + '\u0647\u0645', # ني، نا، ها، هم + '\u0647\u0646', + '\u0643\u0645', + '\u0643\u0646', # هن، كم، كن + '\u0647\u0645\u0627', + '\u0643\u0645\u0627', + '\u0643\u0645\u0648', # هما، كما، كمو + ) + + __suffix_verb_step2a = ( + '\u062a', + '\u0627', + '\u0646', + '\u064a', # ت، ا، ن، ي + '\u0646\u0627', + '\u062a\u0627', + '\u062a\u0646', # نا، تا، تن Past + '\u0627\u0646', + '\u0648\u0646', + '\u064a\u0646', # ان، هن، ين Present + '\u062a\u0645\u0627', # تما + ) + + __suffix_verb_step2b = ('\u0648\u0627', '\u062a\u0645') # وا، تم + + __suffix_verb_step2c = ('\u0648', '\u062a\u0645\u0648') # و # تمو + + __suffix_all_alef_maqsura = '\u0649' # ى + + # Prefixes + __prefix_step1 = ( + '\u0623', # أ + '\u0623\u0623', + '\u0623\u0622', + '\u0623\u0624', + '\u0623\u0627', + '\u0623\u0625', # أأ، أآ، أؤ، أا، أإ + ) + + __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644') # فال، وال + + __prefix_step2b = ('\u0641', '\u0648') # ف، و + + __prefix_step3a_noun = ( + '\u0627\u0644', + '\u0644\u0644', # لل، ال + '\u0643\u0627\u0644', + '\u0628\u0627\u0644', # بال، كال + ) + + __prefix_step3b_noun = ( + '\u0628', + '\u0643', + '\u0644', # ب، ك، ل + '\u0628\u0628', + '\u0643\u0643', # بب، كك + ) + + __prefix_step3_verb = ( + '\u0633\u064a', + '\u0633\u062a', + '\u0633\u0646', + '\u0633\u0623', + ) # سي، ست، سن، سأ + + __prefix_step4_verb = ( + '\u064a\u0633\u062a', + '\u0646\u0633\u062a', + '\u062a\u0633\u062a', + ) # يست، نست، تست + + # Suffixes added due to Conjugation Verbs + __conjugation_suffix_verb_1 = ('\u0647', '\u0643') # ه، ك + + __conjugation_suffix_verb_2 = ( + '\u0646\u064a', + '\u0646\u0627', + '\u0647\u0627', # ني، نا، ها + '\u0647\u0645', + '\u0647\u0646', + '\u0643\u0645', # هم، هن، كم + 
'\u0643\u0646', # كن + ) + __conjugation_suffix_verb_3 = ( + '\u0647\u0645\u0627', + '\u0643\u0645\u0627', + '\u0643\u0645\u0648', + ) # هما، كما، كمو + + __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a') # ا، ن، ي + + __conjugation_suffix_verb_past = ( + '\u0646\u0627', + '\u062a\u0627', + '\u062a\u0646', + ) # نا، تا، تن + + __conjugation_suffix_verb_present = ( + '\u0627\u0646', + '\u0648\u0646', + '\u064a\u0646', + ) # ان، ون، ين + + # Suffixes added due to derivation Names + __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647') # ي، ك، ه + + __conjugation_suffix_noun_2 = ( + '\u0646\u0627', + '\u0643\u0645', # نا، كم + '\u0647\u0627', + '\u0647\u0646', + '\u0647\u0645', # ها، هن، هم + ) + + __conjugation_suffix_noun_3 = ( + '\u0643\u0645\u0627', + '\u0647\u0645\u0627', + ) # كما، هما + + # Prefixes added due to derivation Names + __prefixes1 = ('\u0648\u0627', '\u0641\u0627') # فا، وا + + __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644') # بال كال + + __articles_2len = ('\u0627\u0644', '\u0644\u0644') # ال لل + + # Prepositions letters + __prepositions1 = ('\u0643', '\u0644') # ك، ل + __prepositions2 = ('\u0628\u0628', '\u0643\u0643') # بب، كك + + is_verb = True + is_noun = True + is_defined = False + + suffixes_verb_step1_success = False + suffix_verb_step2a_success = False + suffix_verb_step2b_success = False + suffix_noun_step2c2_success = False + suffix_noun_step1a_success = False + suffix_noun_step2a_success = False + suffix_noun_step2b_success = False + suffixe_noun_step1b_success = False + prefix_step2a_success = False + prefix_step3a_noun_success = False + prefix_step3b_noun_success = False + + def __normalize_pre(self, token): + """ + :param token: string + :return: normalized token type string + """ + # strip diacritics + token = self.__vocalization.sub('', token) + # strip kasheeda + token = self.__kasheeda.sub('', token) + # strip punctuation marks + token = self.__arabic_punctuation_marks.sub('', token) + return token + + def __normalize_post(self, token): + # normalize last hamza + for hamza in self.__last_hamzat: + if token.endswith(hamza): + token = suffix_replace(token, hamza, '\u0621') + break + # normalize other hamzat + token = self.__initial_hamzat.sub('\u0627', token) + token = self.__waw_hamza.sub('\u0648', token) + token = self.__yeh_hamza.sub('\u064a', token) + token = self.__alefat.sub('\u0627', token) + return token + + def __checks_1(self, token): + for prefix in self.__checks1: + if token.startswith(prefix): + if prefix in self.__articles_3len and len(token) > 4: + self.is_noun = True + self.is_verb = False + self.is_defined = True + break + + if prefix in self.__articles_2len and len(token) > 3: + self.is_noun = True + self.is_verb = False + self.is_defined = True + break + + def __checks_2(self, token): + for suffix in self.__checks2: + if token.endswith(suffix): + if suffix == '\u0629' and len(token) > 2: + self.is_noun = True + self.is_verb = False + break + + if suffix == '\u0627\u062a' and len(token) > 3: + self.is_noun = True + self.is_verb = False + break + + def __Suffix_Verb_Step1(self, token): + for suffix in self.__suffix_verb_step1: + if token.endswith(suffix): + if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4: + token = token[:-1] + self.suffixes_verb_step1_success = True + break + + if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5: + token = token[:-2] + self.suffixes_verb_step1_success = True + break + + if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6: + 
token = token[:-3] + self.suffixes_verb_step1_success = True + break + return token + + def __Suffix_Verb_Step2a(self, token): + for suffix in self.__suffix_verb_step2a: + if token.endswith(suffix) and len(token) > 3: + if suffix == '\u062a' and len(token) >= 4: + token = token[:-1] + self.suffix_verb_step2a_success = True + break + + if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4: + token = token[:-1] + self.suffix_verb_step2a_success = True + break + + if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5: + token = token[:-2] # past + self.suffix_verb_step2a_success = True + break + + if suffix in self.__conjugation_suffix_verb_present and len(token) > 5: + token = token[:-2] # present + self.suffix_verb_step2a_success = True + break + + if suffix == '\u062a\u0645\u0627' and len(token) >= 6: + token = token[:-3] + self.suffix_verb_step2a_success = True + break + return token + + def __Suffix_Verb_Step2c(self, token): + for suffix in self.__suffix_verb_step2c: + if token.endswith(suffix): + if suffix == '\u062a\u0645\u0648' and len(token) >= 6: + token = token[:-3] + break + + if suffix == '\u0648' and len(token) >= 4: + token = token[:-1] + break + return token + + def __Suffix_Verb_Step2b(self, token): + for suffix in self.__suffix_verb_step2b: + if token.endswith(suffix) and len(token) >= 5: + token = token[:-2] + self.suffix_verb_step2b_success = True + break + return token + + def __Suffix_Noun_Step2c2(self, token): + for suffix in self.__suffix_noun_step2c2: + if token.endswith(suffix) and len(token) >= 3: + token = token[:-1] + self.suffix_noun_step2c2_success = True + break + return token + + def __Suffix_Noun_Step1a(self, token): + for suffix in self.__suffix_noun_step1a: + if token.endswith(suffix): + if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4: + token = token[:-1] + self.suffix_noun_step1a_success = True + break + + if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5: + token = token[:-2] + self.suffix_noun_step1a_success = True + break + + if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6: + token = token[:-3] + self.suffix_noun_step1a_success = True + break + return token + + def __Suffix_Noun_Step2a(self, token): + for suffix in self.__suffix_noun_step2a: + if token.endswith(suffix) and len(token) > 4: + token = token[:-1] + self.suffix_noun_step2a_success = True + break + return token + + def __Suffix_Noun_Step2b(self, token): + for suffix in self.__suffix_noun_step2b: + if token.endswith(suffix) and len(token) >= 5: + token = token[:-2] + self.suffix_noun_step2b_success = True + break + return token + + def __Suffix_Noun_Step2c1(self, token): + for suffix in self.__suffix_noun_step2c1: + if token.endswith(suffix) and len(token) >= 4: + token = token[:-1] + break + return token + + def __Suffix_Noun_Step1b(self, token): + for suffix in self.__suffix_noun_step1b: + if token.endswith(suffix) and len(token) > 5: + token = token[:-1] + self.suffixe_noun_step1b_success = True + break + return token + + def __Suffix_Noun_Step3(self, token): + for suffix in self.__suffix_noun_step3: + if token.endswith(suffix) and len(token) >= 3: + token = token[:-1] # ya' nisbiya + break + return token + + def __Suffix_All_alef_maqsura(self, token): + for suffix in self.__suffix_all_alef_maqsura: + if token.endswith(suffix): + token = suffix_replace(token, suffix, '\u064a') + return token + + def __Prefix_Step1(self, token): + for prefix in self.__prefix_step1: + if token.startswith(prefix) and len(token) > 
3: + if prefix == '\u0623\u0623': + token = prefix_replace(token, prefix, '\u0623') + break + + elif prefix == '\u0623\u0622': + token = prefix_replace(token, prefix, '\u0622') + break + + elif prefix == '\u0623\u0624': + token = prefix_replace(token, prefix, '\u0624') + break + + elif prefix == '\u0623\u0627': + token = prefix_replace(token, prefix, '\u0627') + break + + elif prefix == '\u0623\u0625': + token = prefix_replace(token, prefix, '\u0625') + break + return token + + def __Prefix_Step2a(self, token): + for prefix in self.__prefix_step2a: + if token.startswith(prefix) and len(token) > 5: + token = token[len(prefix) :] + self.prefix_step2a_success = True + break + return token + + def __Prefix_Step2b(self, token): + for prefix in self.__prefix_step2b: + if token.startswith(prefix) and len(token) > 3: + if token[:2] not in self.__prefixes1: + token = token[len(prefix) :] + break + return token + + def __Prefix_Step3a_Noun(self, token): + for prefix in self.__prefix_step3a_noun: + if token.startswith(prefix): + if prefix in self.__articles_2len and len(token) > 4: + token = token[len(prefix) :] + self.prefix_step3a_noun_success = True + break + if prefix in self.__articles_3len and len(token) > 5: + token = token[len(prefix) :] + break + return token + + def __Prefix_Step3b_Noun(self, token): + for prefix in self.__prefix_step3b_noun: + if token.startswith(prefix): + if len(token) > 3: + if prefix == '\u0628': + token = token[len(prefix) :] + self.prefix_step3b_noun_success = True + break + + if prefix in self.__prepositions2: + token = prefix_replace(token, prefix, prefix[1]) + self.prefix_step3b_noun_success = True + break + + if prefix in self.__prepositions1 and len(token) > 4: + token = token[len(prefix) :] # BUG: cause confusion + self.prefix_step3b_noun_success = True + break + return token + + def __Prefix_Step3_Verb(self, token): + for prefix in self.__prefix_step3_verb: + if token.startswith(prefix) and len(token) > 4: + token = prefix_replace(token, prefix, prefix[1]) + break + return token + + def __Prefix_Step4_Verb(self, token): + for prefix in self.__prefix_step4_verb: + if token.startswith(prefix) and len(token) > 4: + token = prefix_replace(token, prefix, '\u0627\u0633\u062a') + self.is_verb = True + self.is_noun = False + break + return token + + def stem(self, word): + """ + Stem an Arabic word and return the stemmed form. 
+ :param word: string + :return: string + """ + # set initial values + self.is_verb = True + self.is_noun = True + self.is_defined = False + + self.suffix_verb_step2a_success = False + self.suffix_verb_step2b_success = False + self.suffix_noun_step2c2_success = False + self.suffix_noun_step1a_success = False + self.suffix_noun_step2a_success = False + self.suffix_noun_step2b_success = False + self.suffixe_noun_step1b_success = False + self.prefix_step2a_success = False + self.prefix_step3a_noun_success = False + self.prefix_step3b_noun_success = False + + modified_word = word + # guess type and properties + # checks1 + self.__checks_1(modified_word) + # checks2 + self.__checks_2(modified_word) + # Pre_Normalization + modified_word = self.__normalize_pre(modified_word) + # Start stemming + if self.is_verb: + modified_word = self.__Suffix_Verb_Step1(modified_word) + if self.suffixes_verb_step1_success: + modified_word = self.__Suffix_Verb_Step2a(modified_word) + if not self.suffix_verb_step2a_success: + modified_word = self.__Suffix_Verb_Step2c(modified_word) + # or next TODO: How to deal with or next instruction + else: + modified_word = self.__Suffix_Verb_Step2b(modified_word) + if not self.suffix_verb_step2b_success: + modified_word = self.__Suffix_Verb_Step2a(modified_word) + if self.is_noun: + modified_word = self.__Suffix_Noun_Step2c2(modified_word) + if not self.suffix_noun_step2c2_success: + if not self.is_defined: + modified_word = self.__Suffix_Noun_Step1a(modified_word) + # if self.suffix_noun_step1a_success: + modified_word = self.__Suffix_Noun_Step2a(modified_word) + if not self.suffix_noun_step2a_success: + modified_word = self.__Suffix_Noun_Step2b(modified_word) + if ( + not self.suffix_noun_step2b_success + and not self.suffix_noun_step2a_success + ): + modified_word = self.__Suffix_Noun_Step2c1(modified_word) + # or next ? 
todo : how to deal with or next + else: + modified_word = self.__Suffix_Noun_Step1b(modified_word) + if self.suffixe_noun_step1b_success: + modified_word = self.__Suffix_Noun_Step2a(modified_word) + if not self.suffix_noun_step2a_success: + modified_word = self.__Suffix_Noun_Step2b(modified_word) + if ( + not self.suffix_noun_step2b_success + and not self.suffix_noun_step2a_success + ): + modified_word = self.__Suffix_Noun_Step2c1(modified_word) + else: + if not self.is_defined: + modified_word = self.__Suffix_Noun_Step2a(modified_word) + modified_word = self.__Suffix_Noun_Step2b(modified_word) + modified_word = self.__Suffix_Noun_Step3(modified_word) + if not self.is_noun and self.is_verb: + modified_word = self.__Suffix_All_alef_maqsura(modified_word) + + # prefixes + modified_word = self.__Prefix_Step1(modified_word) + modified_word = self.__Prefix_Step2a(modified_word) + if not self.prefix_step2a_success: + modified_word = self.__Prefix_Step2b(modified_word) + modified_word = self.__Prefix_Step3a_Noun(modified_word) + if not self.prefix_step3a_noun_success and self.is_noun: + modified_word = self.__Prefix_Step3b_Noun(modified_word) + else: + if not self.prefix_step3b_noun_success and self.is_verb: + modified_word = self.__Prefix_Step3_Verb(modified_word) + modified_word = self.__Prefix_Step4_Verb(modified_word) + + # post normalization stemming + modified_word = self.__normalize_post(modified_word) + stemmed_word = modified_word + return stemmed_word diff --git a/summa/preprocessing/stopwords.py b/summa/preprocessing/stopwords.py new file mode 100644 index 0000000..3527d68 --- /dev/null +++ b/summa/preprocessing/stopwords.py @@ -0,0 +1,210 @@ +english = """ +all six eleven just less being indeed over both anyway detail four front already through yourselves fify +mill still its before move whose one system also somewhere herself thick show had enough should to only +seeming under herein ours two has might thereafter do them his around thereby get very de none cannot +every whether they not during thus now him nor name regarding several hereafter did always cry whither +beforehand this someone she each further become thereupon where side towards few twelve because often ten +anyhow doing km eg some back used go namely besides yet are cant our beyond ourselves sincere out even +what throughout computer give for bottom mine since please while per find everything behind does various +above between kg neither seemed ever across t somehow be we who were sixty however here otherwise whereupon +nowhere although found hers re along quite fifteen by on about didn last would anything via of could thence +put against keep etc s became ltd hence therein onto or whereafter con among own co afterwards formerly +within seems into others whatever yourself down alone everyone done least another whoever moreover couldnt +must your three from her their together top there due been next anyone whom much call too interest thru +themselves hundred was until empty more himself elsewhere mostly that fire becomes becoming hereby but +else part everywhere former don with than those he me forty myself made full twenty these bill using up us +will nevertheless below anywhere nine can theirs toward my something and sometimes whenever sometime then +almost wherever is describe am it doesn an really as itself at have in seem whence ie any if again hasnt +inc un thin no perhaps latter meanwhile when amount same wherein beside how other take which latterly you +fill either nobody unless whereas see though may after upon 
therefore most hereupon eight amongst never +serious nothing such why a off whereby third i whole noone many well except amoungst yours rather without +so five the first having once +""" + +spanish = """ +un una unas unos uno sobre todo tambien tras otro algun alguno alguna algunos algunas ser es soy eres somos +sois estoy esta estamos estais estan como en para atras porque por que estado estaba ante antes siendo ambos +pero por poder puede puedo podemos podeis pueden fui fue fuimos fueron hacer hago hace hacemos haceis hacen +cada fin incluso primero desde conseguir consigo consigue consigues conseguimos consiguen ir voy va vamos +vais van vaya gueno ha tener tengo tiene tenemos teneis tienen el la lo las los su aqui mio tuyo ellos ellas +nos nosotros vosotros vosotras si dentro solo solamente saber sabes sabe sabemos sabeis saben ultimo largo +bastante haces muchos aquellos aquellas sus entonces tiempo verdad verdadero verdadera cierto ciertos cierta +ciertas intentar intento intenta intentas intentamos intentais intentan dos bajo arriba encima usar uso usas +usa usamos usais usan emplear empleo empleas emplean ampleamos empleais valor muy era eras eramos eran modo +bien cual cuando donde mientras quien con entre sin trabajo trabajar trabajas trabaja trabajamos trabajais +trabajan podria podrias podriamos podrian podriais yo aquel a acabar actualmente acuerdo adelante ademas +ademas adrede afirmo agrego ahi ahora ahi al algo alguna algunas alguno algunos algun alla alli alli alrededor +ambos antano antano ante anterior antes apenas aproximadamente aquel aquella aquellas aquello aquellos aqui +aquel aquella aquellas aquellos aqui arribaabajo aseguro asi asi aun aunque ayer anadio aun b bajo bastante +bien breve buen buena buenas bueno buenos c cada casi cerca cierto cinco claro comento como con conmigo +conocer considera considero contigo contra cosa cosas creo cual cuales cualquier cuando cuanta cuantas cuanto +cuantos cuatro cuenta cuyo cual cuales cuando cuanta cuantas cuanto cuantos como d da dado dan dar de debajo +debe deben deber debido decir dejo del delante demasiado demas dentro deprisa desde despacio despues despues +detras detras dia dias dice dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante dia +dias donde e ejemplo el ella ellas ello ellos embargo en encima encuentra enfrente enseguida entonces entre +era erais eramos eran eras eres es esa esas ese eso esos esta estaba estabais estabamos estaban estabas estad +estada estadas estado estados estais estamos estan estando estar estara estaran estaras estare estareis +estaremos estaria estariais estariamos estarian estarias estara estas este esteis estemos esten estes esto +estos estoy estuve estuviera estuvierais estuvieramos estuvieran estuvieras estuvieron estuviese estuvieseis +estuviesemos estuviesen estuvieses estuvimos estuviste estuvisteis estuvo esta estan ex excepto existe existen +explico expreso f fin final fue fuera fuerais fueramos fueran fueras fueron fuese fueseis fuesemos fuesen +fueses fui fuimos fuiste fuisteis g general gran grande grandes gustar h ha habeis haber habia habiais habiamos +habian habias habida habidas habido habidos habiendo habla hablan habra habran habras habre habreis habremos +habria habriais habriamos habrian habrias habra habia habian hace hacen hacer hacerlo hacia haciendo han has +hasta hay haya hayais hayamos hayan hayas he hecho hemos hicieron hizo horas hoy hube hubiera hubierais +hubieramos hubieran hubieras hubieron hubiese hubieseis hubiesemos hubiesen 
hubieses hubimos hubiste hubisteis +hubo i igual incluso indico informo informo ir j jamas junto k l la lado las le lejos les llego lleva llevar +lo los luego lugar m mal manera manifesto mas mayor me mediante medio mejor menciono menos menudo mi mia mias +mientras mio mios mis misma mismas mismo mismos momento mucha muchas mucho muchos muy mas mi mia mias mio mios +n nada nadie ni ningun ninguna ningunas ninguno ningunos ningun no nos nosotras nosotros nuestra nuestras +nuestro nuestros nueva nuevas nuevo nuevos nunca o ocho os otra otras otro otros p pais para parece parte +partir pasada pasado pasar pais peor pequeno pero pesar poca pocas poco pocos podemos poder podra podran podria +podrian poner por porque posible primer primera primero primeros principalmente pronto propia propias propio +propios proximo proximo proximos pudo pueda puede pueden pues q qeu que quedo queremos querer quien quienes +quiere quiza quizas quiza quizas quien quienes que r raras realizado realizar realizo repente respecto s saber +salvo se sea seais seamos sean seas seguir segun segunda segundo segun seis senor senora ser sera seran seras +sere sereis seremos seria seriais seriamos serian serias sera seran seria senalo si sido siempre siendo siete +sigue siguiente sin sino sisi sobre sois sola solamente solas solo solos somos son soy soyos su supuesto sus +suya suyas suyo suyos se si solo t tal tambien tambien tampoco tan tanto tarde te temprano tendra tendran +tendras tendre tendreis tendremos tendria tendriais tendriamos tendrian tendrias tendra tendran tened teneis +tenemos tener tenga tengais tengamos tengan tengas tengo tenia teniais teniamos tenian tenias tenida tenidas +tenido tenidos teniendo tenia tercera ti tiene tienen tienes toda todas todavia todavia todo todos tomar total +tras trata traves tres tu tus tuve tuviera tuvierais tuvieramos tuvieran tuvieras tuvieron tuviese tuvieseis +tuviesemos tuviesen tuvieses tuvimos tuviste tuvisteis tuvo tuya tuyas tuyo tuyos tu u un una unas uno unos +usted ustedes v va vamos van varias varios veces venir ver vez volver vosotras vosotros vuestra vuestras vuestro +vuestros w x y ya yo z el esa esas ese esos esta estas este estos ultima ultimas ultimo ultimos +""" + +german = """ +aber als am an auch auf aus bei bin bis bist da dadurch daher darum das daß dass dein deine dem den der des +dessen deshalb die dies dieser dieses doch dort du durch ein eine einem einen einer eines er es euer eure fur +hatte hatten hattest hattet hierhinter ich ihr ihre im in ist ja jede jedem jeden jeder jedes jener jenes jetzt +kann kannst konnen konnt machen mein meine mit muß mußt musst mussen mußt nach nachdem nein nicht nun oder seid +sein seine sich sie sind soll sollen sollst sollt sonst soweit sowie und unserunsere unter vom von vor wann +warum was weiter weitere wenn wer werde werden werdet weshalb wie wieder wieso wir wird wirst wo woher wohin zu +zum zur uber +""" + +portuguese = """ +de a o que e do da em um para é com não uma os no se na por mais as dos como mas foi ao ele das tem à seu +sua ou ser quando muito há nos já está eu também só pelo pela até isso ela entre era depois sem mesmo aos ter +seus quem nas me esse eles estão você tinha foram essa num nem suas meu às minha têm numa pelos elas havia seja +qual será nós tenho lhe deles essas esses pelas este fosse dele tu te vocês vos lhes meus minhas teu tua teus +tuas nosso nossa nossos nossas dela delas esta estes estas aquele aquela aqueles aquelas isto aquilo estou está +estamos estão estive esteve estivemos 
estiveram estava estávamos estavam estivera estivéramos esteja estejamos +estejam estivesse estivéssemos estivessem estiver estivermos estiverem hei há havemos hão houve houvemos houveram +houvera houvéramos haja hajamos hajam houvesse houvéssemos houvessem houver houvermos houverem houverei houverá +houveremos houverão houveria houveríamos houveriam sou somos são era éramos eram fui foi fomos foram fora fôramos +seja sejamos sejam fosse fôssemos fossem for formos forem serei será seremos serão seria seríamos seriam tenho +tem temos tém tinha tínhamos tinham tive teve tivemos tiveram tivera tivéramos tenha tenhamos tenham tivesse +tivéssemos tivessem tiver tivermos tiverem terei terá teremos terão teria teríamos teriam +""" + +swedish = """ +aderton adertonde adjö aldrig alla allas allt alltid alltså andra andras annan annat artonde artonn att av bakom +bara behöva behövas behövde behövt beslut beslutat beslutit bland blev bli blir blivit bort borta bra bäst bättre +båda bådas dag dagar dagarna dagen de del delen dem den denna deras dess dessa det detta dig din dina dit ditt +dock dom du där därför då e efter eftersom ej elfte eller elva emot en enkel enkelt enkla enligt ens er era ers +ert ett ettusen fanns fem femte femtio femtionde femton femtonde fick fin finnas finns fjorton fjortonde fjärde +fler flera flesta fram framför från fyra fyrtio fyrtionde få får fått följande för före förlåt förra första +genast genom gick gjorde gjort god goda godare godast gott gälla gäller gällt gärna gå går gått gör göra ha hade +haft han hans har heller hellre helst helt henne hennes hit hon honom hundra hundraen hundraett hur här hög höger +högre högst i ibland icke idag igen igår imorgon in inför inga ingen ingenting inget innan inne inom inte inuti +ja jag jo ju just jämfört kan kanske knappast kom komma kommer kommit kr kunde kunna kunnat kvar legat ligga +ligger lika likställd likställda lilla lite liten litet länge längre längst lätt lättare lättast långsam +långsammare långsammast långsamt långt låt man med mej mellan men mer mera mest mig min mina mindre minst mitt +mittemot mot mycket många måste möjlig möjligen möjligt möjligtvis ned nederst nedersta nedre nej ner ni nio +nionde nittio nittionde nitton nittonde nog noll nr nu nummer när nästa någon någonting något några nån nånting +nåt nödvändig nödvändiga nödvändigt nödvändigtvis och också ofta oftast olika olikt om oss på rakt redan rätt sa +sade sagt samma sedan senare senast sent sex sextio sextionde sexton sextonde sig sin sina sist sista siste sitt +sitta sju sjunde sjuttio sjuttionde sjutton sjuttonde själv sjätte ska skall skulle slutligen små smått snart som +stor stora stort större störst säga säger sämre sämst så sådan sådana sådant ta tack tar tidig tidigare tidigast +tidigt till tills tillsammans tio tionde tjugo tjugoen tjugoett tjugonde tjugotre tjugotvå tjungo tolfte tolv tre +tredje trettio trettionde tretton trettonde två tvåhundra under upp ur ursäkt ut utan utanför ute va vad var vara +varför varifrån varit varje varken vars varsågod vart vem vems verkligen vi vid vidare viktig viktigare viktigast +viktigt vilka vilkas vilken vilket vill väl vänster vänstra värre vår våra vårt än ännu är även åt åtminstone +åtta åttio åttionde åttonde över övermorgon överst övre +""" + +danish = """ +ad af aldrig alle alt anden andet andre at bare begge blev blive bliver da de dem den denne der deres det dette +dig din dine disse dit dog du efter ej eller en end ene eneste enhver er et far fem fik fire flere fleste for +fordi forrige fra 
få får før god godt ham han hans har havde have hej helt hende hendes her hos hun hvad hvem +hver hvilken hvis hvor hvordan hvorfor hvornår i ikke ind ingen intet ja jeg jer jeres jo kan kom komme kommer +kun kunne lad lav lidt lige lille man mand mange med meget men mens mere mig min mine mit mod må ned nej ni nogen +noget nogle nu ny nyt når nær næste næsten og også okay om op os otte over på se seks selv ser ses sig sige +sin sine sit skal skulle som stor store syv så sådan tag tage thi ti til to tre ud under var ved vi vil ville +vor vores være været alene allerede alligevel altid bag blandt burde bør dens derefter derfor derfra deri dermed +derpå derved egen ellers endnu ens enten flest foran først gennem gjorde gjort gør gøre gørende hel heller hen +henover herefter heri hermed herpå hvilke hvilkes hvorefter hvorfra hvorhen hvori hvorimod hvorved igen igennem +imellem imens imod indtil langs lave lavet ligesom længere mellem mest mindre mindst måske nemlig nogensinde nok +omkring overalt samme sammen selvom senere siden stadig synes syntes således temmelig tidligere tilbage tit uden +udover undtagen via vore vær øvrigt +""" + +italian = """ +un avete dal voi nostri avesti stiano starò sull tutto faccio sarai vostri farebbe ai degli farò c faccia lo +sullo farà facevate avendo fummo stiamo staranno questi sia con sue al mio fareste ero di e avessi alle avreste +avesse alla avrei avemmo col ad ne avremmo avevano tuo avessero siate suoi facevo ti che mi questa avrebbe fossero +tua starebbero faceste facesti anche cui ho tra foste stavamo non stessi avevate nostre quelli queste avrete eri +facemmo stavate stia in dagl avrò avremo se feci furono io stavano nelle quante per abbiano nell faceva fecero steste +eravamo farei sarei avevi sui quanto dai dello era loro su quello fossi stava nostra quale una farete gli siano avranno +i stette fece negli facciano facevano dove vostra farebbero sugli vostro uno aveva dall ha avuto avuti sarete sulla sarà +perché essendo fai siete facendo da avevamo starà o faranno lei mie stiate nel fu facciamo stessero noi facciate stando +si è avute sarebbero miei sto contro avrà coi chi ci avrebbero aveste stettero abbiamo sarebbe agl del stareste sua faremo +siamo fanno sei abbiate fui ed quella dalle facessero tue fosti facevamo erano stessimo nei facessimo nello le dell abbia +fosse farai facesse starai stavo staremo mia stesse avevo lui agli fossimo dagli vostre stanno sareste quanti stemmo facessi +ebbe stesti tuoi dallo tutti sugl staremmo vi la dei quanta ebbero stavi saranno delle dalla saresti staresti stai suo nostro +aremo starete saremmo sarò li hai allo avresti dov avuta faresti starei il quelle degl all a ebbi nella eravate stetti negl +come questo facevi sulle più tu della sono starebbe sul hanno faremmo sta avrai avessimo ma l +""" + +# stopwords from https://github.com/bieli/stopwords repository +polish = """ +a aby ach acz aczkolwiek aj albo ale alez ależ ani az aż bardziej bardzo beda bedzie bez deda będą bede będę +będzie bo bowiem by byc być byl byla byli bylo byly był była było były bynajmniej cala cali caly cała cały ci +cie ciebie cię co cokolwiek cos coś czasami czasem czemu czy czyli daleko dla dlaczego dlatego do dobrze +dokad dokąd dosc dość duzo dużo dwa dwaj dwie dwoje dzis dzisiaj dziś gdy gdyby gdyz gdyż gdzie gdziekolwiek +gdzies gdzieś go i ich ile im inna inne inny innych iz iż ja jak jakas jakaś jakby jaki jakichs jakichś jakie +jakis jakiś jakiz jakiż jakkolwiek jako jakos jakoś ją je jeden jedna jednak jednakze jednakże 
jedno jego jej +jemu jesli jest jestem jeszcze jeśli jezeli jeżeli juz już kazdy każdy kiedy kilka kims kimś kto ktokolwiek +ktora ktore ktorego ktorej ktory ktorych ktorym ktorzy ktos ktoś która które którego której który których +którym którzy ku lat lecz lub ma mają mało mam mi miedzy między mimo mna mną mnie moga mogą moi moim moj +moja moje moze mozliwe mozna może możliwe można mój mu musi my na nad nam nami nas nasi nasz nasza nasze +naszego naszych natomiast natychmiast nawet nia nią nic nich nie niech niego niej niemu nigdy nim nimi niz +niż no o obok od około on ona one oni ono oraz oto owszem pan pana pani po pod podczas pomimo ponad poniewaz +ponieważ powinien powinna powinni powinno poza prawie przeciez przecież przed przede przedtem przez przy roku +rowniez również sam sama są sie się skad skąd soba sobą sobie sposob sposób swoje ta tak taka taki takie +takze także tam te tego tej ten teraz też to toba tobą tobie totez toteż totobą trzeba tu tutaj twoi twoim +twoj twoja twoje twój twym ty tych tylko tym u w wam wami was wasz wasza wasze we według wiele wielu więc +więcej wlasnie właśnie wszyscy wszystkich wszystkie wszystkim wszystko wtedy wy z za zaden zadna zadne +zadnych zapewne zawsze ze zeby zeznowu zł znow znowu znów zostal został żaden żadna żadne żadnych że żeby +""" + +arabic = """أنت كليكما اللتان بنا هما إذا اللواتي أينما كلاهما إما كيت إذ هم ليس كيف لك هن لئن ألا عليك وإن إليكما أيها لعل أنتن كأي لسن ممن له +حين اللتين فيها عسى ما هي أين ليسا هنا بما عما هاته ذاك لدى هاك نحو بكم ذواتا هذا أقل اللتيا إن مع لكما بكما قد لي أولئك إليك أن كلا +ليسوا بس ذات فيه منها ومن هو بها كأنما هاهنا هاتان هذي ذلك كما أوه هكذا ذوا ليست لكي نعم لكن خلا لكم أنا بخ تي فلا حبذا أولاء +ذواتي منذ ولو بين لكنما سوى آها تلك إي آي إذما الذي كليهما لكيلا لهما بعض يا بكن حيثما وإذا بهما ذا ها فيما ماذا والذين لستما كل +لوما ثمة متى عند في هيهات أما ذان الذين وهو أنتم كي آه ذي إذن إليكم بل فإن وإذ تلكما هلا فإذا هذه ذلكم فمن إلا إنا بمن كذلك هاتين +عليه كأن هل ذلكما مهما شتان والذي هيا ذين لستن بك مذ ولا هذين كأين فيم حتى إنما بهن هنالك أم لسنا غير لنا منه نحن اللاتي بعد تينك +ذلكن ولكن كلما إيه عدا لها هذان ته حاشا دون أنى عن تين أكثر كلتا إنه بيد كذا هاتي ذو لست لم إليكن وما مما إلى ذانك اللذين من مه أف +كم اللائي حيث ليستا هؤلاء بماذا ليت هيت بهم لهن التي لولا لو لهم هناك ثم سوف كيفما لستم لما ذينك بلى لا تلكم على لاسيما به بي اللذان أي ذه لن عل أو ريث أنتما +""" + +LANGUAGES = { + "danish": danish, + "english": english, + "german": german, + "spanish": spanish, + "portuguese": portuguese, + "swedish": swedish, + "italian": italian, + "polish": polish, + "arabic": arabic +} + + +def get_stopwords_by_language(language): + if language in LANGUAGES: + return LANGUAGES[language] + return "" diff --git a/summa/preprocessing/textcleaner.py b/summa/preprocessing/textcleaner.py new file mode 100644 index 0000000..9716774 --- /dev/null +++ b/summa/preprocessing/textcleaner.py @@ -0,0 +1,188 @@ +import string +import unicodedata +import logging + +logger = logging.getLogger('summa.preprocessing.cleaner') + +try: + from pattern.en import tag + logger.info("'pattern' package found; tag filters are available for English") + HAS_PATTERN = True +except ImportError: + logger.info("'pattern' package not found; tag filters are not available for English") + HAS_PATTERN = False + +import re + +from .snowball import SnowballStemmer +from .stopwords import get_stopwords_by_language +from summa.syntactic_unit import SyntacticUnit + + +# Utility functions adapted from Gensim v0.10.0: +# 
https://github.com/RaRe-Technologies/gensim/blob/0.10.0/gensim/utils.py +# https://github.com/RaRe-Technologies/gensim/blob/0.10.0/gensim/parsing/preprocessing.py + + +SEPARATOR = r"@" +RE_SENTENCE = re.compile('(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)') +AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)\s(\w)") +AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)\s(\w)") +AB_ACRONYM_LETTERS = re.compile("([a-zA-Z])\.([a-zA-Z])\.") +UNDO_AB_SENIOR = re.compile("([A-Z][a-z]{1,2}\.)" + SEPARATOR + "(\w)") +UNDO_AB_ACRONYM = re.compile("(\.[a-zA-Z]\.)" + SEPARATOR + "(\w)") + +STEMMER = None +STOPWORDS = None + + +def set_stemmer_language(language): + global STEMMER + if not language in SnowballStemmer.languages: + raise ValueError("Valid languages are: " + ", ".join(sorted(SnowballStemmer.languages))) + STEMMER = SnowballStemmer(language) + + +def set_stopwords_by_language(language, additional_stopwords): + global STOPWORDS + words = get_stopwords_by_language(language) + if not additional_stopwords: + additional_stopwords = {} + STOPWORDS = frozenset({ w for w in words.split() if w } | { w for w in additional_stopwords if w }) + + +def init_textcleanner(language, additional_stopwords): + set_stemmer_language(language) + set_stopwords_by_language(language, additional_stopwords) + + +def split_sentences(text): + processed = replace_abbreviations(text) + return [undo_replacement(sentence) for sentence in get_sentences(processed)] + + +def replace_abbreviations(text): + return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM]) + + +def undo_replacement(sentence): + return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) + + +def replace_with_separator(text, separator, regexs): + replacement = r"\1" + separator + r"\2" + result = text + for regex in regexs: + result = regex.sub(replacement, result) + return result + + +def get_sentences(text): + for match in RE_SENTENCE.finditer(text): + yield match.group() + + +# Taken from Gensim +RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE) +def strip_punctuation(s): + return RE_PUNCT.sub(" ", s) + + +# Taken from Gensim +RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE) +def strip_numeric(s): + return RE_NUMERIC.sub("", s) + + +def remove_stopwords(sentence): + return " ".join(w for w in sentence.split() if w not in STOPWORDS) + + +def stem_sentence(sentence): + word_stems = [STEMMER.stem(word) for word in sentence.split()] + return " ".join(word_stems) + + +def apply_filters(sentence, filters): + for f in filters: + sentence = f(sentence) + return sentence + + +def filter_words(sentences): + filters = [lambda x: x.lower(), strip_numeric, strip_punctuation, remove_stopwords, + stem_sentence] + apply_filters_to_token = lambda token: apply_filters(token, filters) + return list(map(apply_filters_to_token, sentences)) + + +# Taken from Gensim +def deaccent(text): + """ + Remove accentuation from the given string. + """ + norm = unicodedata.normalize("NFD", text) + result = "".join(ch for ch in norm if unicodedata.category(ch) != 'Mn') + return unicodedata.normalize("NFC", result) + + +# Taken from Gensim +PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) +def tokenize(text, lowercase=False, deacc=False): + """ + Iteratively yield tokens as unicode strings, optionally also lowercasing them + and removing accent marks. 
+ """ + if lowercase: + text = text.lower() + if deacc: + text = deaccent(text) + for match in PAT_ALPHABETIC.finditer(text): + yield match.group() + + +def merge_syntactic_units(original_units, filtered_units, tags=None): + units = [] + for i in range(len(original_units)): + if filtered_units[i] == '': + continue + + text = original_units[i] + token = filtered_units[i] + tag = tags[i][1] if tags else None + sentence = SyntacticUnit(text, token, tag) + sentence.index = i + + units.append(sentence) + + return units + + +def clean_text_by_sentences(text, language="english", additional_stopwords=None): + """ Tokenizes a given text into sentences, applying filters and lemmatizing them. + Returns a SyntacticUnit list. """ + init_textcleanner(language, additional_stopwords) + original_sentences = split_sentences(text) + filtered_sentences = filter_words(original_sentences) + + return merge_syntactic_units(original_sentences, filtered_sentences) + + +def clean_text_by_word(text, language="english", deacc=False, additional_stopwords=None): + """ Tokenizes a given text into words, applying filters and lemmatizing them. + Returns a dict of word -> syntacticUnit. """ + init_textcleanner(language, additional_stopwords) + text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) + original_words = list(tokenize(text_without_acronyms, lowercase=True, deacc=deacc)) + filtered_words = filter_words(original_words) + if HAS_PATTERN: + tags = tag(" ".join(original_words)) # tag needs the context of the words in the text + else: + tags = None + units = merge_syntactic_units(original_words, filtered_words, tags) + return { unit.text : unit for unit in units } + + +def tokenize_by_word(text, deacc=False): + text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) + return tokenize(text_without_acronyms, lowercase=True, deacc=deacc) diff --git a/summa/preprocessing/util.py b/summa/preprocessing/util.py new file mode 100644 index 0000000..0daad9d --- /dev/null +++ b/summa/preprocessing/util.py @@ -0,0 +1,24 @@ +# Natural Language Toolkit: Stemmer Utilities +# +# Copyright (C) 2001-2019 NLTK Project +# Author: Helder +# URL: +# For license information, see LICENSE.TXT + + +def suffix_replace(original, old, new): + """ + Replaces the old suffix of the original string by a new suffix + """ + return original[: -len(old)] + new + + +def prefix_replace(original, old, new): + """ + Replaces the old prefix of the original string by a new suffix + :param original: string + :param old: string + :param new: string + :return: string + """ + return new + original[len(old) :] diff --git a/summa/summarizer.py b/summa/summarizer.py new file mode 100644 index 0000000..952625e --- /dev/null +++ b/summa/summarizer.py @@ -0,0 +1,154 @@ +from math import log10 + +from .pagerank_weighted import pagerank_weighted_scipy as _pagerank +from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences +from .commons import build_graph as _build_graph +from .commons import remove_unreachable_nodes as _remove_unreachable_nodes + + +def _set_graph_edge_weights(graph): + for sentence_1 in graph.nodes(): + for sentence_2 in graph.nodes(): + + edge = (sentence_1, sentence_2) + if sentence_1 != sentence_2 and not graph.has_edge(edge): + similarity = _get_similarity(sentence_1, sentence_2) + if similarity != 0: + graph.add_edge(edge, similarity) + + # Handles the case in which all similarities are zero. + # The resultant summary will consist of random sentences. 
+ if all(graph.edge_weight(edge) == 0 for edge in graph.edges()): + _create_valid_graph(graph) + + +def _create_valid_graph(graph): + nodes = graph.nodes() + + for i in range(len(nodes)): + for j in range(len(nodes)): + if i == j: + continue + + edge = (nodes[i], nodes[j]) + + if graph.has_edge(edge): + graph.del_edge(edge) + + graph.add_edge(edge, 1) + + +def _get_similarity(s1, s2): + words_sentence_one = s1.split() + words_sentence_two = s2.split() + + common_word_count = _count_common_words(words_sentence_one, words_sentence_two) + + log_s1 = log10(len(words_sentence_one)) + log_s2 = log10(len(words_sentence_two)) + + if log_s1 + log_s2 == 0: + return 0 + + return common_word_count / (log_s1 + log_s2) + + +def _count_common_words(words_sentence_one, words_sentence_two): + return len(set(words_sentence_one) & set(words_sentence_two)) + + +def _format_results(extracted_sentences, split, score): + if score: + return [(sentence.text, sentence.score) for sentence in extracted_sentences] + if split: + return [sentence.text for sentence in extracted_sentences] + return "\n".join([sentence.text for sentence in extracted_sentences]) + + +def _add_scores_to_sentences(sentences, scores): + for sentence in sentences: + # Adds the score to the object if it has one. + if sentence.token in scores: + sentence.score = scores[sentence.token] + else: + sentence.score = 0 + + +def _get_sentences_with_word_count(sentences, words): + """ Given a list of sentences, returns a list of sentences with a + total word count similar to the word count provided. + """ + word_count = 0 + selected_sentences = [] + # Loops until the word count is reached. + for sentence in sentences: + words_in_sentence = len(sentence.text.split()) + + # Checks if the inclusion of the sentence gives a better approximation + # to the word parameter. + if abs(words - word_count - words_in_sentence) > abs(words - word_count): + return selected_sentences + + selected_sentences.append(sentence) + word_count += words_in_sentence + + return selected_sentences + + +def _extract_most_important_sentences(sentences, ratio, words): + sentences.sort(key=lambda s: s.score, reverse=True) + + # If no "words" option is selected, the number of sentences is + # reduced by the provided ratio. + if words is None: + length = len(sentences) * ratio + return sentences[:int(length)] + + # Else, the ratio is ignored. + else: + return _get_sentences_with_word_count(sentences, words) + + +def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None): + if not isinstance(text, str): + raise ValueError("Text parameter must be a Unicode object (str)!") + + # Gets a list of processed sentences. + sentences = _clean_text_by_sentences(text, language, additional_stopwords) + + # Creates the graph and calculates the similarity coefficient for every pair of nodes. + graph = _build_graph([sentence.token for sentence in sentences]) + _set_graph_edge_weights(graph) + + # Remove all nodes with all edges weights equal to zero. + _remove_unreachable_nodes(graph) + + # PageRank cannot be run in an empty graph. + if len(graph.nodes()) == 0: + return [] if split else "" + + # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score + pagerank_scores = _pagerank(graph) + + # Adds the summa scores to the sentence objects. 
+ _add_scores_to_sentences(sentences, pagerank_scores) + + # EDIT: return the whole sentences with scores + return sentences + + # Extracts the most important sentences with the selected criterion. + # extracted_sentences = _extract_most_important_sentences(sentences, ratio, words) + + # Sorts the extracted sentences by apparition order in the original text. + # extracted_sentences.sort(key=lambda s: s.index) + + # return _format_results(extracted_sentences, split, scores) + + +def get_graph(text, language="english"): + sentences = _clean_text_by_sentences(text, language) + + graph = _build_graph([sentence.token for sentence in sentences]) + _set_graph_edge_weights(graph) + + return graph diff --git a/summa/syntactic_unit.py b/summa/syntactic_unit.py new file mode 100644 index 0000000..e5feee7 --- /dev/null +++ b/summa/syntactic_unit.py @@ -0,0 +1,14 @@ +class SyntacticUnit(object): + + def __init__(self, text, token=None, tag=None): + self.text = text + self.token = token + self.tag = tag[:2] if tag else None # just first two letters of tag + self.index = -1 + self.score = -1 + + def __str__(self): + return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" + + def __repr__(self): + return str(self) diff --git a/summa/textrank.py b/summa/textrank.py new file mode 100644 index 0000000..328a131 --- /dev/null +++ b/summa/textrank.py @@ -0,0 +1,97 @@ +import argparse +import os +import sys +import warnings + +from .summarizer import summarize +from .keywords import keywords + +# Types of summarization +SENTENCE = 0 +WORD = 1 + +DEFAULT_RATIO = 0.2 + + +def textrank(text, summarize_by=SENTENCE, ratio=DEFAULT_RATIO, words=None, additional_stopwords=None): + if summarize_by == SENTENCE: + return summarize(text, ratio, words, additional_stopwords=additional_stopwords) + else: + return keywords(text, ratio, words, additional_stopwords=additional_stopwords) + + +def existing_file(file_name): + try: + with open(file_name, 'r') as file: + return file.read() + except Exception: + raise argparse.ArgumentTypeError("The file provided could not be opened.") + + +def restricted_float(x): + x = float(x) + if x < 0.0 or x > 1.0: + raise argparse.ArgumentTypeError("{} not in range [0.0, 1.0]".format(x)) + return x + + +def parse_args(args): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, prog="textrank", description="Extract the most relevant sentences or keywords of a given text using the TextRank algorithm.") + + group = parser.add_mutually_exclusive_group(required=True) + # New API + group.add_argument('--summarize', metavar="path/to/file", type=existing_file, + help="Run textrank to summarize the input text.") + group.add_argument('--keywords', metavar="path/to/file", type=existing_file, + help="Run textrank to extract keywords from the input text.") + # Old API + group.add_argument('--text', '-t', metavar="path/to/file", type=existing_file, + help="(Deprecated) Text to summarize if --summary option is selected") + + parser.add_argument('--summary', '-s', metavar="{0,1}", type=int, choices=[SENTENCE, WORD], default=0, + help="(Deprecated) Type of unit to summarize: sentence (0) or word (1)") + parser.add_argument('--ratio', '-r', metavar="r", type=restricted_float, default=DEFAULT_RATIO, + help="Float number (0,1] that defines the length of the summary. It's a proportion of the original text") + parser.add_argument('--words', '-w', metavar="#words", type=int, + help="Number to limit the length of the summary. 
The length option is ignored if the word limit is set.") + parser.add_argument('--additional_stopwords', '-a', metavar="list,of,stopwords", + help="Either a string of comma separated stopwords or a path to a file which has comma separated stopwords in every line") + + return parser.parse_args(args) + + +def main(): + args = parse_args(sys.argv[1:]) + + mode = None + text = None + + if args.summarize: + text = args.summarize + mode = SENTENCE + elif args.keywords: + text = args.keywords + mode = WORD + elif args.summary: # Old api + warnings.warn("The --summary option is deprecated. Please use either --summarize or --keywords", DeprecationWarning) + text = args.text + mode = args.summary + + if text is None: + raise argparse.ArgumentTypeError('Error: no text to summarize provided.') + else: + raise argparse.ArgumentTypeError('Error: --summarize or --keywords is required') + + additional_stopwords = None + if args.additional_stopwords: + if os.path.exists(args.additional_stopwords): + with open(args.additional_stopwords) as f: + additional_stopwords = {s for l in f for s in l.strip().split(",")} + else: + additional_stopwords = args.additional_stopwords.split(",") + + print(textrank(text, mode, args.ratio, args.words, additional_stopwords)) + + +if __name__ == "__main__": + main() diff --git a/template.html b/template.html new file mode 100644 index 0000000..504c0eb --- /dev/null +++ b/template.html @@ -0,0 +1,31 @@ + + + + + + + + + TextRank Opacity + + + + + + + + + + + +
+ + {% for s in sentences %} + {{ s.html|safe }} + {% endfor %} + +
+ + + + diff --git a/texts/warehouse.txt b/texts/warehouse.txt new file mode 100644 index 0000000..b16a42e --- /dev/null +++ b/texts/warehouse.txt @@ -0,0 +1 @@ +A warehouse is a building for storing goods. Warehouses are used by manufacturers, importers, exporters, wholesalers, transport businesses, customs, etc. They are usually large plain buildings in industrial parks on the outskirts of cities, towns, or villages. They usually have loading docks to load and unload goods from trucks. Sometimes warehouses are designed for the loading and unloading of goods directly from railways, airports, or seaports. They often have cranes and forklifts for moving goods, which are usually placed on ISO standard pallets and then loaded into pallet racks. Stored goods can include any raw materials, packing materials, spare parts, components, or finished goods associated with agriculture, manufacturing, and production. In India and Hong Kong, a warehouse may be referred to as a "godown". There are also godowns in the Shanghai Bund. \ No newline at end of file diff --git a/www/css/main.css b/www/css/main.css new file mode 100644 index 0000000..46097e8 --- /dev/null +++ b/www/css/main.css @@ -0,0 +1,38 @@ + +:root{ + --lh: 1.35rem; +} + +body{ + margin: var(--lh); + line-height: var(--lh); +} + +@media print{ + body{ + margin: 0; + font-size: 10pt; + } +} + +main{ + max-width: 42rem; + margin: 0 auto; +} + +/* h1,h2,h3,h4,h5,h6{ + line-height: var(--lh); +} */ + +h1{ + text-align: center; + margin: calc(2 * var(--lh)) 0; +} + +h2,h3,h4,h5,h6{ + margin: calc(3 * var(--lh)) 0 var(--lh); +} + +:is(h1,h2,h3,h4,h5,h6) + :is(h1,h2,h3,h4,h5,h6){ + margin-top: var(--lh); +} \ No newline at end of file diff --git a/www/index.html b/www/index.html new file mode 100644 index 0000000..1dec090 --- /dev/null +++ b/www/index.html @@ -0,0 +1,177 @@ + + + + + + + + + TextRank Opacity + + + + + + + + + + + +
+ + +

Sambucus

+ + Sambucus is a genus of flowering plants in the family Adoxaceae. + + The various species are commonly called elder or elderberry. + + The genus was formerly placed in the honeysuckle family, Caprifoliaceae, but was reclassified as Adoxaceae due to genetic and morphological comparisons to plants in the genus Adoxa. + +

Description

+ + The oppositely arranged leaves are pinnate with 5–9 leaflets (or, rarely, 3 or 11). + + Each leaf is 5–30 cm (2–12 in) long, and the leaflets have serrated margins. + + They bear large clusters of small white or cream-colored flowers in late spring; these are followed by clusters of small black, blue-black, or red berries (rarely yellow or white). + +

Color

+ + Sambucus fruit is rich in anthocyanidins that combine to give elderberry juice an intense blue-purple coloration that turns reddish on dilution with water. + + These pigments are used as colorants in various products, and "elderberry juice color" is listed by the US FDA as allowable in certified organic food products. + + In Japan, elderberry juice is listed as an approved "natural color additive" under the Food and Sanitation Law. Fibers can be dyed with elderberry juice (using alum as a mordant) to give a light "elderberry" color. + +

Toxicity

+ + Although the cooked berries (pulp and skin) of most species of Sambucus are edible, the uncooked berries and other parts of plants from this genus are poisonous. + + Leaves, twigs, branches, seeds, roots, flowers, and berries of Sambucus plants produce cyanogenic glycosides, which have toxic properties. + + Ingesting a sufficient quantity of cyanogenic glycosides from berry juice, flower tea, or beverages made from fresh leaves, branches, and fruit has been shown to cause illness, including nausea, vomiting, abdominal cramps, diarrhea, and weakness. + + In August 1983, a group of 25 people in Monterey County, California, became suddenly ill by ingesting elderberry juice pressed from fresh, uncooked Sambucus mexicana berries, leaves, and stems. + + The density of cyanogenic glycosides is higher in tea made from flowers (or leaves) than from the berries.The seeds of Sambucus callicarpa are reported to be poisonous and may cause vomiting or diarrhea. + +

Taxonomy

+ + The taxonomy of the genus Sambucus L., originally described by Carl Linnaeus and hence its botanical authority, has been complicated by its wide geographical distribution and morphological diversity. + + This has led to overdescription of the species and infraspecific taxa (subspecies, varieties or forms). + + The name comes from the Greek word sambuce, an ancient wind instrument, about the removal of pith from the twigs to make whistles.Species recognized in this genus are: + +

Distribution and habitat

+ + The genus occurs in temperate to subtropical regions of the world. + + More widespread in the Northern Hemisphere, its Southern Hemisphere occurrence is restricted to parts of Australasia and South America. + + Many species are widely cultivated for their ornamental leaves, flowers, and fruit. + +

Habitat

+ + Elder commonly grows near farms and homesteads. + + It is a nitrogen-dependent plant and thus is generally found near places of organic waste disposal. + + Elders are often grown as a hedgerow plant in Britain since they take very fast, can be bent into shape easily, and grow quite profusely, thus having gained the reputation of being 'an instant hedge'. + + It is not generally affected by soil type or pH level and will virtually grow anywhere sufficient sunlight is available. + +

Ecology

+ + In Northern California, elderberries are a food for migrating band-tailed pigeons. + + Elders are used as food plants by the larvae of some Lepidoptera species including brown-tail, buff ermine, dot moth, emperor moth, engrailed moth, swallow-tailed moth and the V-pug. + + The crushed foliage and immature fruit have a strong fetid smell. + + Valley elderberry longhorn beetles in California are very often found around red or blue elderberry bushes. + + Females lay their eggs on the bark. + + The pith of elder has been used by watchmakers for cleaning tools before intricate work. + +

Cultivation

+ + Traditional uses of Sambucus involved berries, seeds, leaves, and flowers or component extracts. + + Ornamental varieties of Sambucus are grown in gardens for their showy flowers, fruits and lacy foliage which support habitat for wildlife. + + Of the many native species, three are used as ornamentals, S. + + nigra, S. + + canadensis and S. + + racemosa. + +

Uses

+ +

Nutrition

+ + Raw elderberries are 80% water, 18% carbohydrates, and less than 1% each of protein and fat (table). + + In a 100-gram (3+1⁄2 oz) amount, elderberries supply 305 kilojoules (73 kcal) of food energy and are a rich source of vitamin C, providing 43% of the Daily Value (DV). + + Elderberries also have moderate contents of vitamin B6 (18% DV) and iron (12% DV), with no other nutrients in significant content. + +

Dietary supplement

+ + Elderberry fruit or flowers are used as dietary supplements to prevent or provide relief from minor diseases, such as flu, colds, constipation, and other conditions, served as a tea, extract or in a capsule. + + The use of elderberry supplements increased early in the COVID-19 pandemic. + + There is insufficient research to establish its effectiveness for such uses, or its safety profile. + + The raw or unripe fruit of S. + + nigra or its extracts may contain a cyanogenic glycoside that is potentially toxic. + +

Traditional medicine

+ + Although practitioners of traditional medicine have used elderberry over centuries, there is no high-quality clinical evidence that such practices provide any benefit. + + The flowers of Sambucus nigra are used to produce elderflower cordial. + + St-Germain, a French liqueur, is made from elderflowers. + + Hallands Fläder, a Swedish akvavit, is flavoured with elderflowers. + + Hollowed elderberry twigs have traditionally been used as spiles to tap maple trees for syrup. + + Additionally, they have been hollowed out and used as flutes, blowguns, and syringes.The fruit of S. + + callicarpa is eaten by birds and mammals. + + It is inedible to humans when raw but can be made into wine.Elderberry twigs and fruit are employed in creating dyes for basketry. + + These stems are dyed a very deep black by soaking them in a wash made from the berry stems of the elderberry. + +

In popular culture

+ + Folklore related to elder trees is extensive and can vary according to region. + + In some traditions, the elder tree is thought to ward off evil and give protection from witches, while other beliefs say that witches often congregate under the plant, especially when it is full of fruit. + + If an elder tree was cut down, a spirit known as the Elder Mother would be released and take her revenge. + + The tree could only safely be cut while chanting a rhyme to the Elder Mother.Made from the branch of an elder tree, the Elder Wand plays a pivotal role in the final book of the Harry Potter series, which was nearly named Harry Potter and the Elder Wand before author J. + + K. Rowling decided on Harry Potter and the Deathly Hallows.Elton John's 1973 album Don't Shoot Me I'm Only the Piano Player features a song titled "Elderberry Wine". + + In Monty Python and the Holy Grail, John Cleese as the French Taunter tells the knights of Camelot, "Your mother was a hamster, and your father smelt of elderberries." + +

Gallery

+ + +
+ + + + \ No newline at end of file
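
Note (not part of the patch itself): a minimal sketch of how the modified summarizer in this diff can be driven. Unlike upstream summa, the `summa.summarizer.summarize` added here returns every `SyntacticUnit` with its raw PageRank score (see the `# EDIT` comment in summa/summarizer.py) instead of a trimmed summary, leaving the ranking-to-presentation decision to the caller. The max-normalisation step below is an illustrative assumption for display purposes, not something the patch performs.

    from summa.summarizer import summarize

    # texts/warehouse.txt is added by this patch.
    with open("texts/warehouse.txt") as f:
        text = f.read()

    # The patched summarize() returns all SyntacticUnit objects, each carrying
    # .text, .token, .index and the raw PageRank .score computed on the
    # sentence-similarity graph (shared words / summed log sentence lengths).
    sentences = summarize(text, language="english")

    # Hypothetical normalisation: scale against the top-ranked sentence so
    # scores land in [0, 1] (this step is not in the patch).
    top = max((s.score for s in sentences), default=1.0) or 1.0
    for s in sentences:
        print(f"{s.score / top:.2f}  {s.text}")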