mirror of https://github.com/RYDE-WORK/Langchain-Chatchat.git
synced 2026-01-19 13:23:16 +08:00

add nltk_data

This commit is contained in:
parent 033e8959ca
commit 4aadcfa042
@@ -1,5 +1,9 @@
from configs.model_config import *
from chains.local_doc_qa import LocalDocQA
import os
import nltk

nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path

# return top-k text chunk from vector store
VECTOR_SEARCH_TOP_K = 10
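The added lines pin NLTK's data lookup to the repository's own nltk_data directory, so the tokenizers and corpora committed below are found without a separate nltk.download() step. A minimal sketch of the effect, assuming the script sits next to the nltk_data directory added here:

import os
import nltk

# Prepend the repo-local directory, exactly as the commit does;
# nltk.data.find() walks nltk.data.path in order, so this copy wins
# over any system-wide installation.
nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path

# Resolves to the bundled pickle, or raises LookupError if it is missing.
print(nltk.data.find("tokenizers/punkt/PY3/english.pickle"))

Prepending, rather than appending, is what makes the bundled copies authoritative.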
76 nltk_data/corpora/cmudict/README Normal file
@@ -0,0 +1,76 @@
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]

ftp://ftp.cs.cmu.edu/project/speech/dict/
https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a

Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.

File Format: Each line consists of an uppercased word,
a counter (for alternative pronunciations), and a transcription.
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

Phoneme Example Translation    Phoneme Example Translation
------- ------- -----------    ------- ------- -----------
AA      odd     AA D           AE      at      AE T
AH      hut     HH AH T        AO      ought   AO T
AW      cow     K AW           AY      hide    HH AY D
B       be      B IY           CH      cheese  CH IY Z
D       dee     D IY           DH      thee    DH IY
EH      Ed      EH D           ER      hurt    HH ER T
EY      ate     EY T           F       fee     F IY
G       green   G R IY N       HH      he      HH IY
IH      it      IH T           IY      eat     IY T
JH      gee     JH IY          K       key     K IY
L       lee     L IY           M       me      M IY
N       knee    N IY           NG      ping    P IH NG
OW      oat     OW T           OY      toy     T OY
P       pee     P IY           R       read    R IY D
S       sea     S IY           SH      she     SH IY
T       tea     T IY           TH      theta   TH EY T AH
UH      hood    HH UH D        UW      two     T UW
V       vee     V IY           W       we      W IY
Y       yield   Y IY L D       Z       zee     Z IY
ZH      seizure S IY ZH ER

(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
are contiguous, and not separated by FIRE'S 1.)

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   The contents of this file are deemed to be source code.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

This work was supported in part by funding from the Defense Advanced
Research Projects Agency, the Office of Naval Research and the National
Science Foundation of the United States of America, and by member
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
the contributions of many volunteers to the expansion and improvement of
this dictionary.

THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
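With nltk.data.path set up as in this commit, the bundled dictionary is readable through NLTK's stock corpus reader. A minimal sketch, using the README's own NATURAL example as the lookup:

from nltk.corpus import cmudict

# cmudict.dict() maps each lowercased word to a list of pronunciations;
# alternate pronunciations (the counter in the file format) appear as
# additional list entries.
pron = cmudict.dict()
print(pron["natural"])  # [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'], ...]

# Stress digits ride on the vowel phonemes: 1=primary, 2=secondary, 0=none.
primary = [p for p in pron["natural"][0] if p.endswith("1")]
print(primary)  # ['AE1']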
133737 nltk_data/corpora/cmudict/cmudict Normal file (diff suppressed: too large)

98 nltk_data/tokenizers/punkt/PY3/README Normal file
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please consult the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File                Language     Source                             Contents                     Size of training corpus (in tokens)   Model contributed by
=======================================================================================================================================================================
czech.pickle        Czech        Multilingual Corpus 1 (ECI)        Lidove Noviny                ~345,000                              Jan Strunk / Tibor Kiss
                                                                    Literarni Noviny
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
danish.pickle       Danish       Avisdata CD-Rom Ver. 1.1. 1995     Berlingske Tidende           ~550,000                              Jan Strunk / Tibor Kiss
                                 (Berlingske Avisdata, Copenhagen)  Weekend Avisen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle        Dutch        Multilingual Corpus 1 (ECI)        De Limburger                 ~340,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
english.pickle      English      Penn Treebank (LDC)                Wall Street Journal          ~469,000                              Jan Strunk / Tibor Kiss
                    (American)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle     Estonian     University of Tartu, Estonia       Eesti Ekspress               ~359,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle      Finnish      Finnish Parole Corpus, Finnish     Books and major national     ~364,000                              Jan Strunk / Tibor Kiss
                                 Text Bank (Suomen Kielen           newspapers
                                 Tekstipankki)
                                 Finnish Center for IT Science
                                 (CSC)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
french.pickle       French       Multilingual Corpus 1 (ECI)        Le Monde                     ~370,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
german.pickle       German       Neue Zürcher Zeitung AG            Neue Zürcher Zeitung         ~847,000                              Jan Strunk / Tibor Kiss
                                 (Switzerland)                      CD-ROM
                                                                    (Uses "ss"
                                                                    instead of "ß")
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
greek.pickle        Greek        Efstathios Stamatatos              To Vima (TO BHMA)            ~227,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
italian.pickle      Italian      Multilingual Corpus 1 (ECI)        La Stampa, Il Mattino        ~312,000                              Jan Strunk / Tibor Kiss
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle    Norwegian    Centre for Humanities              Bergens Tidende              ~479,000                              Jan Strunk / Tibor Kiss
                    (Bokmål and  Information Technologies,
                    Nynorsk)     Bergen
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
polish.pickle       Polish       Polish National Corpus             Literature, newspapers,      ~1,000,000                            Krzysztof Langner
                                 (http://www.nkjp.pl/)              etc.
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle   Portuguese   CETENFolha Corpus                  Folha de São Paulo           ~321,000                              Jan Strunk / Tibor Kiss
                    (Brazilian)  (Linguateca)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle      Slovene      TRACTOR                            Delo                         ~354,000                              Jan Strunk / Tibor Kiss
                                 Slovene Academy for Arts
                                 and Sciences
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle      Spanish      Multilingual Corpus 1 (ECI)        Sur                          ~353,000                              Jan Strunk / Tibor Kiss
                    (European)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle      Swedish      Multilingual Corpus 1 (ECI)        Dagens Nyheter               ~339,000                              Jan Strunk / Tibor Kiss
                                                                    (and some other texts)
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle      Turkish      METU Turkish Corpus                Milliyet                     ~333,000                              Jan Strunk / Tibor Kiss
                                 (Türkçe Derlem Projesi)
                                 University of Ankara
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain", "r", "iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle", "wb")
pickle.dump(tokenizer, out)
out.close()

---------
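To complement the training code, a short usage sketch that loads one of the pickles added by this commit (the sample text is illustrative):

import nltk

# nltk.data.load() unpickles the tokenizer from the first matching
# entry on nltk.data.path -- here, the repo-local nltk_data directory.
tokenizer = nltk.data.load("tokenizers/punkt/PY3/english.pickle")

text = "Dr. Smith arrived at 9 a.m. He left an hour later."
print(tokenizer.tokenize(text))
# expected: ['Dr. Smith arrived at 9 a.m.', 'He left an hour later.']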
BIN nltk_data/tokenizers/punkt/PY3/czech.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/danish.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/dutch.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/english.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/estonian.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/finnish.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/french.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/german.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/greek.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/italian.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/malayalam.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/norwegian.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/polish.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/portuguese.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/russian.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/slovene.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/spanish.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/swedish.pickle Normal file
BIN nltk_data/tokenizers/punkt/PY3/turkish.pickle Normal file
(Binary files not shown.)
98 nltk_data/tokenizers/punkt/README Normal file
@@ -0,0 +1,98 @@
(File contents identical to nltk_data/tokenizers/punkt/PY3/README above.)
159140 nltk_data/tokenizers/punkt/czech.pickle Normal file (diff suppressed: too large)
162767 nltk_data/tokenizers/punkt/danish.pickle Normal file (diff suppressed: too large)
97138 nltk_data/tokenizers/punkt/dutch.pickle Normal file (diff suppressed: too large)
61702 nltk_data/tokenizers/punkt/english.pickle Normal file (diff suppressed: too large)
206369 nltk_data/tokenizers/punkt/estonian.pickle Normal file (diff suppressed: too large)
240379 nltk_data/tokenizers/punkt/finnish.pickle Normal file (diff suppressed: too large)
80529 nltk_data/tokenizers/punkt/french.pickle Normal file (diff suppressed: too large)
181299 nltk_data/tokenizers/punkt/german.pickle Normal file (diff suppressed: too large)
89257 nltk_data/tokenizers/punkt/greek.pickle Normal file (diff suppressed: too large)
90202 nltk_data/tokenizers/punkt/italian.pickle Normal file (diff suppressed: too large)
BIN nltk_data/tokenizers/punkt/malayalam.pickle Normal file (binary file not shown)
162978 nltk_data/tokenizers/punkt/norwegian.pickle Normal file (diff suppressed: too large)
245172 nltk_data/tokenizers/punkt/polish.pickle Normal file (diff suppressed: too large)
90795 nltk_data/tokenizers/punkt/portuguese.pickle Normal file (diff suppressed: too large)
BIN nltk_data/tokenizers/punkt/russian.pickle Normal file (binary file not shown)
106925 nltk_data/tokenizers/punkt/slovene.pickle Normal file (diff suppressed: too large)
82636 nltk_data/tokenizers/punkt/spanish.pickle Normal file (diff suppressed: too large)
133719 nltk_data/tokenizers/punkt/swedish.pickle Normal file (diff suppressed: too large)
138187 nltk_data/tokenizers/punkt/turkish.pickle Normal file (diff suppressed: too large)
3 webui.py
@@ -3,6 +3,9 @@ import os
import shutil
from chains.local_doc_qa import LocalDocQA
from configs.model_config import *
import nltk

nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path

# return top-k text chunk from vector store
VECTOR_SEARCH_TOP_K = 6