mirror of https://github.com/RYDE-WORK/Langchain-Chatchat.git
synced 2026-02-05 22:33:24 +08:00

add nltk_data

This commit is contained in:
parent 033e8959ca
commit 4aadcfa042
@@ -1,5 +1,9 @@
 from configs.model_config import *
 from chains.local_doc_qa import LocalDocQA
+import os
+import nltk
+
+nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path
 
 # return top-k text chunk from vector store
 VECTOR_SEARCH_TOP_K = 10
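The added lines prepend a repository-local nltk_data directory to nltk.data.path, so NLTK resolves bundled resources before looking in system-wide locations and never needs a network download at runtime. A minimal standalone sketch of the lookup behaviour (the resource name below is only an illustration):

    import os
    import nltk

    # Search ./nltk_data (relative to this file) before the default locations.
    nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path

    # nltk.data.find() walks nltk.data.path in order and raises LookupError
    # if the resource is missing from every entry.
    try:
        print(nltk.data.find("tokenizers/punkt/english.pickle"))
    except LookupError:
        print("punkt model not found on nltk.data.path")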
76  nltk_data/corpora/cmudict/README  Normal file
@@ -0,0 +1,76 @@
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]

ftp://ftp.cs.cmu.edu/project/speech/dict/
https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a

Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.

File Format: Each line consists of an uppercased word,
a counter (for alternative pronunciations), and a transcription.
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:

Phoneme Example Translation    Phoneme Example Translation
------- ------- -----------    ------- ------- -----------
AA      odd     AA D           AE      at      AE T
AH      hut     HH AH T        AO      ought   AO T
AW      cow     K AW           AY      hide    HH AY D
B       be      B IY           CH      cheese  CH IY Z
D       dee     D IY           DH      thee    DH IY
EH      Ed      EH D           ER      hurt    HH ER T
EY      ate     EY T           F       fee     F IY
G       green   G R IY N       HH      he      HH IY
IH      it      IH T           IY      eat     IY T
JH      gee     JH IY          K       key     K IY
L       lee     L IY           M       me      M IY
N       knee    N IY           NG      ping    P IH NG
OW      oat     OW T           OY      toy     T OY
P       pee     P IY           R       read    R IY D
S       sea     S IY           SH      she     SH IY
T       tea     T IY           TH      theta   TH EY T AH
UH      hood    HH UH D        UW      two     T UW
V       vee     V IY           W       we      W IY
Y       yield   Y IY L D       Z       zee     Z IY
ZH      seizure S IY ZH ER

(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
are contiguous, and not separated by FIRE'S 1.)

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   The contents of this file are deemed to be source code.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

This work was supported in part by funding from the Defense Advanced
Research Projects Agency, the Office of Naval Research and the National
Science Foundation of the United States of America, and by member
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
the contributions of many volunteers to the expansion and improvement of
this dictionary.

THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
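Given the entry format the README describes (uppercased word, variant counter, phoneme transcription), the dictionary file can be read with a few lines of Python. A minimal sketch, assuming the file sits at the path added by this commit; latin-1 is an assumption for any non-ASCII bytes:

    from collections import defaultdict

    pronunciations = defaultdict(list)
    with open("nltk_data/corpora/cmudict/cmudict", encoding="latin-1") as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 3:
                word, _counter, phones = parts[0], parts[1], parts[2:]
                # Alternative pronunciations accumulate under the same key.
                pronunciations[word].append(phones)

    print(pronunciations["NATURAL"])  # [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L']]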
133737  nltk_data/corpora/cmudict/cmudict  Normal file
(File diff suppressed because it is too large.)
98  nltk_data/tokenizers/punkt/PY3/README  Normal file
@@ -0,0 +1,98 @@
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)

Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
been contributed by various people using NLTK for sentence boundary detection.

For information about how to use these models, please confer the tokenization HOWTO:
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
and chapter 3.8 of the NLTK book:
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation

There are pretrained tokenizers for the following languages:

File               Language     Source                              Contents                        Size (tokens)  Model contributed by
========================================================================================================================================
czech.pickle       Czech        Multilingual Corpus 1 (ECI)         Lidove Noviny,                  ~345,000       Jan Strunk / Tibor Kiss
                                                                    Literarni Noviny
----------------------------------------------------------------------------------------------------------------------------------------
danish.pickle      Danish       Avisdata CD-Rom Ver. 1.1. 1995      Berlingske Tidende,             ~550,000       Jan Strunk / Tibor Kiss
                                (Berlingske Avisdata, Copenhagen)   Weekend Avisen
----------------------------------------------------------------------------------------------------------------------------------------
dutch.pickle       Dutch        Multilingual Corpus 1 (ECI)         De Limburger                    ~340,000       Jan Strunk / Tibor Kiss
----------------------------------------------------------------------------------------------------------------------------------------
english.pickle     English      Penn Treebank (LDC)                 Wall Street Journal             ~469,000       Jan Strunk / Tibor Kiss
                   (American)
----------------------------------------------------------------------------------------------------------------------------------------
estonian.pickle    Estonian     University of Tartu, Estonia        Eesti Ekspress                  ~359,000       Jan Strunk / Tibor Kiss
----------------------------------------------------------------------------------------------------------------------------------------
finnish.pickle     Finnish      Finnish Parole Corpus, Finnish      Books and major national        ~364,000       Jan Strunk / Tibor Kiss
                                Text Bank (Suomen Kielen            newspapers
                                Tekstipankki), Finnish Center
                                for IT Science (CSC)
----------------------------------------------------------------------------------------------------------------------------------------
french.pickle      French       Multilingual Corpus 1 (ECI)         Le Monde                        ~370,000       Jan Strunk / Tibor Kiss
                   (European)
----------------------------------------------------------------------------------------------------------------------------------------
german.pickle      German       Neue Zürcher Zeitung AG             Neue Zürcher Zeitung CD-ROM     ~847,000       Jan Strunk / Tibor Kiss
                   (Switzerland)                                    (uses "ss" instead of "ß")
----------------------------------------------------------------------------------------------------------------------------------------
greek.pickle       Greek        Efstathios Stamatatos               To Vima (TO BHMA)               ~227,000       Jan Strunk / Tibor Kiss
----------------------------------------------------------------------------------------------------------------------------------------
italian.pickle     Italian      Multilingual Corpus 1 (ECI)         La Stampa, Il Mattino           ~312,000       Jan Strunk / Tibor Kiss
----------------------------------------------------------------------------------------------------------------------------------------
norwegian.pickle   Norwegian    Centre for Humanities               Bergens Tidende                 ~479,000       Jan Strunk / Tibor Kiss
                   (Bokmål and  Information Technologies,
                   Nynorsk)     Bergen
----------------------------------------------------------------------------------------------------------------------------------------
polish.pickle      Polish       Polish National Corpus              Literature, newspapers, etc.    ~1,000,000     Krzysztof Langner
                                (http://www.nkjp.pl/)
----------------------------------------------------------------------------------------------------------------------------------------
portuguese.pickle  Portuguese   CETENFolha Corpus (Linguateca)      Folha de São Paulo              ~321,000       Jan Strunk / Tibor Kiss
                   (Brazilian)
----------------------------------------------------------------------------------------------------------------------------------------
slovene.pickle     Slovene      TRACTOR, Slovene Academy for        Delo                            ~354,000       Jan Strunk / Tibor Kiss
                                Arts and Sciences
----------------------------------------------------------------------------------------------------------------------------------------
spanish.pickle     Spanish      Multilingual Corpus 1 (ECI)         Sur                             ~353,000       Jan Strunk / Tibor Kiss
                   (European)
----------------------------------------------------------------------------------------------------------------------------------------
swedish.pickle     Swedish      Multilingual Corpus 1 (ECI)         Dagens Nyheter                  ~339,000       Jan Strunk / Tibor Kiss
                                                                    (and some other texts)
----------------------------------------------------------------------------------------------------------------------------------------
turkish.pickle     Turkish      METU Turkish Corpus (Türkçe         Milliyet                        ~333,000       Jan Strunk / Tibor Kiss
                                Derlem Projesi),
                                University of Ankara
----------------------------------------------------------------------------------------------------------------------------------------

The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
Unicode using the codecs module.

Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
Computational Linguistics 32: 485-525.

---- Training Code ----

# import punkt
import nltk.tokenize.punkt

# Make a new Tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in training corpus (one example: Slovene)
import codecs
text = codecs.open("slovene.plain", "Ur", "iso-8859-2").read()

# Train tokenizer
tokenizer.train(text)

# Dump pickled tokenizer
import pickle
out = open("slovene.pickle", "wb")
pickle.dump(tokenizer, out)
out.close()

---------
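To use one of these pretrained models rather than train a new one, NLTK's data loader can unpickle it directly; with the bundled directory prepended to nltk.data.path as in this commit, the lookup stays offline. A short sketch (the sample sentence is arbitrary):

    import nltk.data

    # Resolves the resource via nltk.data.path, then unpickles the model.
    tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

    text = "Dr. Smith arrived at 9 a.m. He left an hour later."
    print(tokenizer.tokenize(text))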
Binary files added (contents not shown):
BIN  nltk_data/tokenizers/punkt/PY3/czech.pickle
BIN  nltk_data/tokenizers/punkt/PY3/danish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/dutch.pickle
BIN  nltk_data/tokenizers/punkt/PY3/english.pickle
BIN  nltk_data/tokenizers/punkt/PY3/estonian.pickle
BIN  nltk_data/tokenizers/punkt/PY3/finnish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/french.pickle
BIN  nltk_data/tokenizers/punkt/PY3/german.pickle
BIN  nltk_data/tokenizers/punkt/PY3/greek.pickle
BIN  nltk_data/tokenizers/punkt/PY3/italian.pickle
BIN  nltk_data/tokenizers/punkt/PY3/malayalam.pickle
BIN  nltk_data/tokenizers/punkt/PY3/norwegian.pickle
BIN  nltk_data/tokenizers/punkt/PY3/polish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/portuguese.pickle
BIN  nltk_data/tokenizers/punkt/PY3/russian.pickle
BIN  nltk_data/tokenizers/punkt/PY3/slovene.pickle
BIN  nltk_data/tokenizers/punkt/PY3/spanish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/swedish.pickle
BIN  nltk_data/tokenizers/punkt/PY3/turkish.pickle
98  nltk_data/tokenizers/punkt/README  Normal file
@@ -0,0 +1,98 @@
(Contents identical to nltk_data/tokenizers/punkt/PY3/README above.)
Large and binary files added (diffs suppressed):
159140  nltk_data/tokenizers/punkt/czech.pickle
162767  nltk_data/tokenizers/punkt/danish.pickle
 97138  nltk_data/tokenizers/punkt/dutch.pickle
 61702  nltk_data/tokenizers/punkt/english.pickle
206369  nltk_data/tokenizers/punkt/estonian.pickle
240379  nltk_data/tokenizers/punkt/finnish.pickle
 80529  nltk_data/tokenizers/punkt/french.pickle
181299  nltk_data/tokenizers/punkt/german.pickle
 89257  nltk_data/tokenizers/punkt/greek.pickle
 90202  nltk_data/tokenizers/punkt/italian.pickle
BIN     nltk_data/tokenizers/punkt/malayalam.pickle
162978  nltk_data/tokenizers/punkt/norwegian.pickle
245172  nltk_data/tokenizers/punkt/polish.pickle
 90795  nltk_data/tokenizers/punkt/portuguese.pickle
BIN     nltk_data/tokenizers/punkt/russian.pickle
106925  nltk_data/tokenizers/punkt/slovene.pickle
 82636  nltk_data/tokenizers/punkt/spanish.pickle
133719  nltk_data/tokenizers/punkt/swedish.pickle
138187  nltk_data/tokenizers/punkt/turkish.pickle
webui.py (+3)
@@ -3,6 +3,9 @@ import os
 import shutil
 from chains.local_doc_qa import LocalDocQA
 from configs.model_config import *
+import nltk
+
+nltk.data.path = [os.path.join(os.path.dirname(__file__), "nltk_data")] + nltk.data.path
 
 # return top-k text chunk from vector store
 VECTOR_SEARCH_TOP_K = 6
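A note on keeping the bundled directory up to date: instead of copying files into nltk_data by hand, nltk.download() accepts a download_dir, so a missing resource can be fetched straight into the repo-local tree. A sketch of such one-time setup code (the "punkt" identifier is an example; this is not part of webui.py itself):

    import os
    import nltk

    local_nltk_data = os.path.join(os.path.dirname(__file__), "nltk_data")
    nltk.data.path = [local_nltk_data] + nltk.data.path

    # Fetch the resource into the bundled directory only if it is missing.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", download_dir=local_nltk_data)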