mirror of https://github.com/RYDE-WORK/Langchain-Chatchat.git
synced 2026-02-05 22:33:24 +08:00

add nltk_data

This commit is contained in:
parent 033e8959ca
commit 4aadcfa042
@@ -1,5 +1,9 @@
 from configs.model_config import *
 from chains.local_doc_qa import LocalDocQA
+import os
+import nltk
+
+nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path
 
 # return top-k text chunk from vector store
 VECTOR_SEARCH_TOP_K = 10
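The added lines prepend a repository-local nltk_data directory to nltk.data.path, so NLTK resolves bundled resources before looking in system-wide locations and never needs a network download at runtime. A minimal standalone sketch of the lookup behaviour (the resource name below is only an illustration):

    import os
    import nltk

    # Search ./nltk_data (relative to this file) before the default locations.
    nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path

    # nltk.data.find() walks nltk.data.path in order and raises LookupError
    # if the resource is missing from every entry.
    try:
        print(nltk.data.find("tokenizers/punkt/english.pickle"))
    except LookupError:
        print("punkt model not found on nltk.data.path")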
76  nltk_data/corpora/cmudict/README  Normal file
@@ -0,0 +1,76 @@
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]

ftp://ftp.cs.cmu.edu/project/speech/dict/
https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a

Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.

File Format: Each line consists of an uppercased word,
a counter (for alternative pronunciations), and a transcription.
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

Phoneme Example Translation    Phoneme Example Translation
------- ------- -----------    ------- ------- -----------
AA      odd     AA D           AE      at      AE T
AH      hut     HH AH T        AO      ought   AO T
AW      cow     K AW           AY      hide    HH AY D
B       be      B IY           CH      cheese  CH IY Z
D       dee     D IY           DH      thee    DH IY
EH      Ed      EH D           ER      hurt    HH ER T
EY      ate     EY T           F       fee     F IY
G       green   G R IY N       HH      he      HH IY
IH      it      IH T           IY      eat     IY T
JH      gee     JH IY          K       key     K IY
L       lee     L IY           M       me      M IY
N       knee    N IY           NG      ping    P IH NG
OW      oat     OW T           OY      toy     T OY
P       pee     P IY           R       read    R IY D
S       sea     S IY           SH      she     SH IY
T       tea     T IY           TH      theta   TH EY T AH
UH      hood    HH UH D        UW      two     T UW
V       vee     V IY           W       we      W IY
Y       yield   Y IY L D       Z       zee     Z IY
ZH      seizure S IY ZH ER

(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
are contiguous, and not separated by FIRE'S 1.)

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   The contents of this file are deemed to be source code.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

This work was supported in part by funding from the Defense Advanced
Research Projects Agency, the Office of Naval Research and the National
Science Foundation of the United States of America, and by member
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
the contributions of many volunteers to the expansion and improvement of
this dictionary.

THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
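Given the entry format the README describes (uppercased word, variant counter, phoneme transcription), the dictionary file can be read with a few lines of Python. A minimal sketch, assuming the file sits at the path added by this commit; latin-1 is an assumption for any non-ASCII bytes:

    from collections import defaultdict

    pronunciations = defaultdict(list)
    with open("nltk_data/corpora/cmudict/cmudict", encoding="latin-1") as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 3:
                word, _counter, phones = parts[0], parts[1], parts[2:]
                # Alternative pronunciations accumulate under the same key.
                pronunciations[word].append(phones)

    print(pronunciations["NATURAL"])  # [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L']]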
133737  nltk_data/corpora/cmudict/cmudict  Normal file
(File diff suppressed because it is too large.)
98  nltk_data/tokenizers/punkt/PY3/README  Normal file
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File               Language     Source                              Contents                        Size (tokens)  Model contributed by
========================================================================================================================================
czech.pickle       Czech        Multilingual Corpus 1 (ECI)         Lidove Noviny,                  ~345,000       Jan Strunk / Tibor Kiss
                                                                    Literarni Noviny
----------------------------------------------------------------------------------------------------------------------------------------
danish.pickle      Danish       Avisdata CD-Rom Ver. 1.1. 1995      Berlingske Tidende,             ~550,000       Jan Strunk / Tibor Kiss
                                (Berlingske Avisdata, Copenhagen)   Weekend Avisen
----------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle       Dutch        Multilingual Corpus 1 (ECI)         De Limburger                    ~340,000       Jan Strunk / Tibor Kiss
----------------------------------------------------------------------------------------------------------------------------------------
english.pickle     English      Penn Treebank (LDC)                 Wall Street Journal             ~469,000       Jan Strunk / Tibor Kiss
                   (American)
----------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle    Estonian     University of Tartu, Estonia        Eesti Ekspress                  ~359,000       Jan Strunk / Tibor Kiss
----------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle     Finnish      Finnish Parole Corpus, Finnish      Books and major national        ~364,000       Jan Strunk / Tibor Kiss
                                Text Bank (Suomen Kielen            newspapers
                                Tekstipankki), Finnish Center
                                for IT Science (CSC)
----------------------------------------------------------------------------------------------------------------------------------------
french.pickle      French       Multilingual Corpus 1 (ECI)         Le Monde                        ~370,000       Jan Strunk / Tibor Kiss
                   (European)
----------------------------------------------------------------------------------------------------------------------------------------
german.pickle      German       Neue Zürcher Zeitung AG             Neue Zürcher Zeitung CD-ROM     ~847,000       Jan Strunk / Tibor Kiss
                   (Switzerland)                                    (uses "ss" instead of "ß")
----------------------------------------------------------------------------------------------------------------------------------------
greek.pickle       Greek        Efstathios Stamatatos               To Vima (TO BHMA)               ~227,000       Jan Strunk / Tibor Kiss
----------------------------------------------------------------------------------------------------------------------------------------
italian.pickle     Italian      Multilingual Corpus 1 (ECI)         La Stampa, Il Mattino           ~312,000       Jan Strunk / Tibor Kiss
----------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle   Norwegian    Centre for Humanities               Bergens Tidende                 ~479,000       Jan Strunk / Tibor Kiss
                   (Bokmål and  Information Technologies,
                   Nynorsk)     Bergen
----------------------------------------------------------------------------------------------------------------------------------------
polish.pickle      Polish       Polish National Corpus              Literature, newspapers, etc.    ~1,000,000     Krzysztof Langner
                                (http://www.nkjp.pl/)
----------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle  Portuguese   CETENFolha Corpus (Linguateca)      Folha de São Paulo              ~321,000       Jan Strunk / Tibor Kiss
                   (Brazilian)
----------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle     Slovene      TRACTOR, Slovene Academy for        Delo                            ~354,000       Jan Strunk / Tibor Kiss
                                Arts and Sciences
----------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle     Spanish      Multilingual Corpus 1 (ECI)         Sur                             ~353,000       Jan Strunk / Tibor Kiss
                   (European)
----------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle     Swedish      Multilingual Corpus 1 (ECI)         Dagens Nyheter                  ~339,000       Jan Strunk / Tibor Kiss
                                                                    (and some other texts)
----------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle     Turkish      METU Turkish Corpus (Türkçe         Milliyet                        ~333,000       Jan Strunk / Tibor Kiss
                                Derlem Projesi),
                                University of Ankara
----------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain", "Ur", "iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle", "wb")
pickle.dump(tokenizer, out)
out.close()

---------
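To use one of these pretrained models rather than train a new one, NLTK's data loader can unpickle it directly; with the bundled directory prepended to nltk.data.path as in this commit, the lookup stays offline. A short sketch (the sample sentence is arbitrary):

    import nltk.data

    # Resolves the resource via nltk.data.path, then unpickles the model.
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

    text = "Dr. Smith arrived at 9 a.m. He left an hour later."
    print(tokenizer.tokenize(text))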
Binary files added (contents not shown):
BIN  nltk_data/tokenizers/punkt/PY3/czech.pickle
BIN  nltk_data/tokenizers/punkt/PY3/danish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/dutch.pickle
BIN  nltk_data/tokenizers/punkt/PY3/english.pickle
BIN  nltk_data/tokenizers/punkt/PY3/estonian.pickle
BIN  nltk_data/tokenizers/punkt/PY3/finnish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/french.pickle
BIN  nltk_data/tokenizers/punkt/PY3/german.pickle
BIN  nltk_data/tokenizers/punkt/PY3/greek.pickle
BIN  nltk_data/tokenizers/punkt/PY3/italian.pickle
BIN  nltk_data/tokenizers/punkt/PY3/malayalam.pickle
BIN  nltk_data/tokenizers/punkt/PY3/norwegian.pickle
BIN  nltk_data/tokenizers/punkt/PY3/polish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/portuguese.pickle
BIN  nltk_data/tokenizers/punkt/PY3/russian.pickle
BIN  nltk_data/tokenizers/punkt/PY3/slovene.pickle
BIN  nltk_data/tokenizers/punkt/PY3/spanish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/swedish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/turkish.pickle
98  nltk_data/tokenizers/punkt/README  Normal file
@@ -0,0 +1,98 @@
(Contents identical to nltk_data/tokenizers/punkt/PY3/README above.)
Large and binary files added (diffs suppressed):
159140  nltk_data/tokenizers/punkt/czech.pickle
162767  nltk_data/tokenizers/punkt/danish.pickle
 97138  nltk_data/tokenizers/punkt/dutch.pickle
 61702  nltk_data/tokenizers/punkt/english.pickle
206369  nltk_data/tokenizers/punkt/estonian.pickle
240379  nltk_data/tokenizers/punkt/finnish.pickle
 80529  nltk_data/tokenizers/punkt/french.pickle
181299  nltk_data/tokenizers/punkt/german.pickle
 89257  nltk_data/tokenizers/punkt/greek.pickle
 90202  nltk_data/tokenizers/punkt/italian.pickle
BIN     nltk_data/tokenizers/punkt/malayalam.pickle
162978  nltk_data/tokenizers/punkt/norwegian.pickle
245172  nltk_data/tokenizers/punkt/polish.pickle
 90795  nltk_data/tokenizers/punkt/portuguese.pickle
BIN     nltk_data/tokenizers/punkt/russian.pickle
106925  nltk_data/tokenizers/punkt/slovene.pickle
 82636  nltk_data/tokenizers/punkt/spanish.pickle
133719  nltk_data/tokenizers/punkt/swedish.pickle
138187  nltk_data/tokenizers/punkt/turkish.pickle
webui.py (+3)
@@ -3,6 +3,9 @@ import os
 import shutil
 from chains.local_doc_qa import LocalDocQA
 from configs.model_config import *
+import nltk
+
+nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path
 
 # return top-k text chunk from vector store
 VECTOR_SEARCH_TOP_K = 6
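A note on keeping the bundled directory up to date: instead of copying files into nltk_data by hand, nltk.download() accepts a download_dir, so a missing resource can be fetched straight into the repo-local tree. A sketch of such one-time setup code (the "punkt" identifier is an example; this is not part of webui.py itself):

    import os
    import nltk

    local_nltk_data = os.path.join(os.path.dirname(__file__), "nltk_data")
    nltk.data.path = [local_nltk_data] + nltk.data.path

    # Fetch the resource into the bundled directory only if it is missing.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", download_dir=local_nltk_data)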