t = tensor([0,1,2])
r = reverse_text(t)
test_eq(r, tensor([2,1,0]))
文本数据
帮助在 Datasets 中收集文本数据的函数和转换。
向后
反转文本后与前向模型进行集成,可以获得更高的准确性。只需要一个 type_tfm,它会在文本导入时将其反转。
reverse_text
reverse_text (x)
数值化
数值化是将 token 转换为整数的步骤。第一步是建立 token 到索引的对应关系,这称为 vocab。
make_vocab
make_vocab (count, min_freq=3, max_vocab=60000, special_toks=None)
从 Counter count 创建一个大小为 max_vocab 的 vocab,其中包含出现次数不少于 min_freq 的项目(例如在下面的示例中,min_freq=1 时只出现一次的 'd' 也会被保留)。
如果 token 数量超过 max_vocab
,则保留出现频率最高的 token。
为了在使用混合精度时的性能考虑,词汇表的大小总是设置为 8 的倍数,可能会通过添加 xxfake
token 来实现。
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
test_eq(set([x for x in make_vocab(count) if not x.startswith('xxfake')]),
        set(defaults.text_spec_tok + 'a'.split()))
test_eq(len(make_vocab(count))%8, 0)
test_eq(set([x for x in make_vocab(count, min_freq=1) if not x.startswith('xxfake')]),
        set(defaults.text_spec_tok + 'a b c d'.split()))
test_eq(set([x for x in make_vocab(count,max_vocab=12, min_freq=1) if not x.startswith('xxfake')]),
        set(defaults.text_spec_tok + 'a b c'.split()))
LMTensorText
LMTensorText (x, **kwargs)
在语言建模中表示文本的张量的语义类型
TensorText
TensorText (x, **kwargs)
表示文本的张量的语义类型
Numericalize
Numericalize (vocab=None, min_freq=3, max_vocab=60000, special_toks=None)
将 token 化文本转换为数值化 ID 的可逆转换
num = Numericalize(min_freq=2)
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
start = 'This is an example of text '
如果没有传入 vocab
,则在设置时从数据创建 vocab,使用 make_vocab
并带有 min_freq
和 max_vocab
参数。
start = 'This is an example of text'
num = Numericalize(min_freq=1)
num.setup(L(start.split(), 'this is another text'.split()))
test_eq(set([x for x in num.vocab if not x.startswith('xxfake')]),
        set(defaults.text_spec_tok + 'This is an example of text this another'.split()))
test_eq(len(num.vocab)%8, 0)
t = num(start.split())

test_eq(t, tensor([11, 9, 12, 13, 14, 10]))
test_eq(num.decode(t), start.split())
num = Numericalize(min_freq=2)
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set([x for x in num.vocab if not x.startswith('xxfake')]),
        set(defaults.text_spec_tok + 'is text'.split()))
test_eq(len(num.vocab)%8, 0)
t = num(start.split())
test_eq(t, tensor([0, 9, 0, 0, 0, 10]))
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text'.split())
LMDataLoader
LMDataLoader (dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, shuffle:bool=False, verbose:bool=False, do_setup:bool=True, pin_memory=False, timeout=0, batch_size=None, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False, pin_memory_device='', wif=None, before_iter=None, after_item=None, before_batch=None, after_batch=None, after_iter=None, create_batches=None, create_item=None, create_batch=None, retain=None, get_idxs=None, sample=None, shuffle_fn=None, do_batch=None)
适用于语言建模的 DataLoader
为使其工作,dataset
应为数值化文本的集合。可以传入 lens
来优化创建过程,否则,LMDataLoader
将对 dataset
进行完整遍历以计算它们。使用 cache
来避免不必要的重复加载项。
LMDataLoader
将所有文本(可能经过 shuffle
)连接成一个大流,将其分割成 bs
个连续的句子,然后每次处理 seq_len
个。
bs,sl = 4,3
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).map(tensor)
dl = LMDataLoader(ints, bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])
dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)
for x,y in dl: test_eq(x[:,1:], y[:,:-1])
((x0,y0), (x1,y1)) = tuple(dl)
#Second batch begins where first batch ended
test_eq(y0[:,-1], x1[:,0])
test_eq(type(x0), LMTensorText)
分类
对于分类任务,我们通过使用填充(padding)来处理文本长度不一致的问题。
Pad_Input
Pad_Input (enc=None, dec=None, split_idx=None, order=None)
一个总是接受元组作为项的 transform
pad_idx
用于填充,填充应用于样本的 pad_fields
。如果 pad_first
为 True
,则在开头进行填充;如果添加了 backwards
,则张量会被翻转。
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0),
        [(tensor([1,2,3]),1), (tensor([4,5,0]),2), (tensor([6,0,0]), 3)])
test_eq(pad_input([(tensor([1,2,3]), (tensor([6]))), (tensor([4,5]), tensor([4,5])), (tensor([6]), (tensor([1,2,3])))], pad_idx=0, pad_fields=1),
        [(tensor([1,2,3]),(tensor([6,0,0]))), (tensor([4,5]),tensor([4,5,0])), ((tensor([6]),tensor([1, 2, 3])))])
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, pad_first=True),
        [(tensor([1,2,3]),1), (tensor([0,4,5]),2), (tensor([0,0,6]), 3)])
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, backwards=True),
        [(tensor([3,2,1]),1), (tensor([5,4,0]),2), (tensor([6,0,0]), 3)])
x = pad_input([(TensorText([1,2,3]),1), (TensorText([4,5]), 2), (TensorText([6]), 3)], pad_idx=0)
test_eq(x, [(tensor([1,2,3]),1), (tensor([4,5,0]), 2), (tensor([6,0,0]), 3)])
test_eq(pad_input.decode(x[1][0]), tensor([4,5]))
使用 pad_idx
将 x
填充到长度 pad_len
。如果 pad_first
为 false,则所有填充都附加到 x
的末尾,直到 x
的长度达到 pad_len
。否则,如果 pad_first
为 true,则将大小为 seq_len
的块前置到 x
的开头,剩余的填充附加到 x
的末尾。
pad_chunk
pad_chunk (x, pad_idx=1, pad_first=True, seq_len=72, pad_len=10)
通过添加大小为 seq_len
的填充块来填充 x
print('pad_first: ',pad_chunk(torch.tensor([1,2,3]),seq_len=3,pad_idx=0,pad_len=8))
print('pad_last: ',pad_chunk(torch.tensor([1,2,3]),seq_len=3,pad_idx=0,pad_len=8,pad_first=False))
pad_first: tensor([0, 0, 0, 1, 2, 3, 0, 0])
pad_last: tensor([1, 2, 3, 0, 0, 0, 0, 0])
pad_input_chunk
是 pad_chunk
的一个版本,它作用于列表的列表。
pad_input_chunk
pad_input_chunk (samples, n_inp=1, pad_idx=1, pad_first=True, seq_len=72, pad_len=10)
通过添加大小为 seq_len
的填充块来填充 samples
与基础的 pad_input
的区别在于,大部分填充(如果 pad_first=True
则在开头,如果 pad_first=False
则在末尾)仅以 seq_len
的整数倍应用。剩余的填充应用于末尾(如果 pad_first=False
则在开头)。这是为了与带有循环模型的 SequenceEncoder
一起使用。
pad_input_chunk([(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)], pad_idx=0, seq_len=3,n_inp=2)
[(TensorText([1, 2, 3, 4, 5, 6]), TensorText([0, 0, 0, 1, 2, 0]), 1)]
test_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),1), (tensor([1,2,3]), 2), (tensor([1,2]), 3)], pad_idx=0, seq_len=2),
        [(tensor([1,2,3,4,5,6]),1), (tensor([0,0,1,2,3,0]),2), (tensor([0,0,0,0,1,2]), 3)])
test_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),), (tensor([1,2,3]),), (tensor([1,2]),)], pad_idx=0, seq_len=2),
        [(tensor([1,2,3,4,5,6]),), (tensor([0,0,1,2,3,0]),), (tensor([0,0,0,0,1,2]),)])
test_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),), (tensor([1,2,3]),), (tensor([1,2]),)], pad_idx=0, seq_len=2, pad_first=False),
        [(tensor([1,2,3,4,5,6]),), (tensor([1,2,3,0,0,0]),), (tensor([1,2,0,0,0,0]),)])
test_eq(pad_input_chunk([(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)], pad_idx=0, seq_len=2,n_inp=2),
        [(TensorText([1,2,3,4,5,6]),TensorText([0,0,0,0,1,2]),1)])
pad_input_chunk
的 Transform
版本。此版本支持类型、解码以及 Transform
的其他功能。
Pad_Chunk
Pad_Chunk (pad_idx=1, pad_first=True, seq_len=72, decode=True, **kwargs)
通过添加大小为 seq_len
的填充块来填充 samples
这里是 Pad_Chunk
的一个例子。
pc=Pad_Chunk(pad_idx=0,seq_len=3)
out=pc([(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)])
print('Inputs: ',*[(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)])
print('Encoded: ',*out)
print('Decoded: ',*pc.decode(out))
Inputs: (TensorText([1, 2, 3, 4, 5, 6]), TensorText([1, 2]), 1)
Encoded: (TensorText([1, 2, 3, 4, 5, 6]), TensorText([0, 0, 0, 1, 2, 0]), 1)
Decoded: (TensorText([1, 2, 3, 4, 5, 6]), TensorText([1, 2]), 1)
pc=Pad_Chunk(pad_idx=0, seq_len=2)
test_eq(pc([(TensorText([1,2,3,4,5,6]),1), (TensorText([1,2,3]), 2), (TensorText([1,2]), 3)]),
        [(tensor([1,2,3,4,5,6]),1), (tensor([0,0,1,2,3,0]),2), (tensor([0,0,0,0,1,2]), 3)])
pc=Pad_Chunk(pad_idx=0, seq_len=2)
test_eq(pc([(TensorText([1,2,3,4,5,6]),), (TensorText([1,2,3]),), (TensorText([1,2]),)]),
        [(tensor([1,2,3,4,5,6]),), (tensor([0,0,1,2,3,0]),), (tensor([0,0,0,0,1,2]),)])
pc=Pad_Chunk(pad_idx=0, seq_len=2, pad_first=False)
test_eq(pc([(TensorText([1,2,3,4,5,6]),), (TensorText([1,2,3]),), (TensorText([1,2]),)]),
        [(tensor([1,2,3,4,5,6]),), (tensor([1,2,3,0,0,0]),), (tensor([1,2,0,0,0,0]),)])
pc=Pad_Chunk(pad_idx=0, seq_len=2)
test_eq(pc([(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)]),
        [(TensorText([1,2,3,4,5,6]),TensorText([0,0,0,0,1,2]),1)])
SortedDL
SortedDL (dataset, sort_func=None, res=None, bs:int=64, shuffle:bool=False, num_workers:int=None, verbose:bool=False, do_setup:bool=True, pin_memory=False, timeout=0, batch_size=None, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False, pin_memory_device='', wif=None, before_iter=None, after_item=None, before_batch=None, after_batch=None, after_iter=None, create_batches=None, create_item=None, create_batch=None, retain=None, get_idxs=None, sample=None, shuffle_fn=None, do_batch=None)
根据 sort_func
给定的顺序遍历各项的 DataLoader
类型 | 默认值 | 详情 | |
---|---|---|---|
dataset | 用于加载数据的映射式或迭代式数据集 | ||
sort_func | NoneType | None | |
res | NoneType | None | |
bs | int | 64 | 批量大小 |
shuffle | bool | False | 是否打乱数据 |
num_workers | int | None | 并行使用的 CPU 核心数(默认值:最多 16 个可用核心) |
verbose | bool | False | 是否打印详细日志 |
do_setup | bool | True | 是否为批量转换运行 setup() |
pin_memory | bool | False | |
timeout | int | 0 | |
batch_size | NoneType | None | |
drop_last | bool | False | |
indexed | NoneType | None | |
n | NoneType | None | |
device | NoneType | None | |
persistent_workers | bool | False | |
pin_memory_device | str | ||
wif | NoneType | None | |
before_iter | NoneType | None | |
after_item | NoneType | None | |
before_batch | NoneType | None | |
after_batch | NoneType | None | |
after_iter | NoneType | None | |
create_batches | NoneType | None | |
create_item | NoneType | None | |
create_batch | NoneType | None | |
retain | NoneType | None | |
get_idxs | NoneType | None | |
sample | NoneType | None | |
shuffle_fn | NoneType | None | |
do_batch | NoneType | None |
res
是将 sort_func
应用于 dataset
中所有元素的结果。如果可用,可以传入 res
,以避免对整个数据集进行初始遍历,从而大大加快初始化速度。例如,如果按文本长度排序(如默认的 sort_func
,称为 _default_sort
),则应向 res
传入一个包含 dataset
中每个元素长度的列表,以利用这种加速。
为了获得相同的验证集初始化加速,可以将 val_res
(验证集文本长度列表)传递给 SortedDL
的 kwargs
参数。下面是一个通过传递训练集和验证集的文本长度列表来减少初始化时间的示例。
# Pass the training dataset text lengths to SortedDL
srtd_dl=partial(SortedDL, res = train_text_lens)
# Pass the validation dataset text lengths
dl_kwargs = [{},{'val_res': val_text_lens}]
# init our Datasets
dsets = Datasets(...)
# init our Dataloaders
dls = dsets.dataloaders(...,dl_type = srtd_dl, dl_kwargs = dl_kwargs)
如果 shuffle
为 True
,这将对排序结果进行少量打乱,以便批次中的项目大小大致相同,但不是严格按照排序顺序。
ds = [(tensor([1,2]),1), (tensor([3,4,5,6]),2), (tensor([7]),3), (tensor([8,9,10]),4)]
dl = SortedDL(ds, bs=2, before_batch=partial(pad_input, pad_idx=0))
test_eq(list(dl), [(tensor([[ 3, 4, 5, 6], [ 8, 9, 10, 0]]), tensor([2, 4])),
                   (tensor([[1, 2], [7, 0]]), tensor([1, 3]))])
ds = [(tensor(range(random.randint(1,10))),i) for i in range(101)]
dl = SortedDL(ds, bs=2, create_batch=partial(pad_input, pad_idx=-1), shuffle=True, num_workers=0)
batches = list(dl)
max_len = len(batches[0][0])
for b in batches:
    assert(len(b[0])) <= max_len
    test_ne(b[0][-1], -1)
用于文本的 TransformBlock
要使用数据块 API,您需要为文本构建此数据块。
TextBlock
TextBlock (tok_tfm, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, special_toks=None)
用于文本的 TransformBlock
为了高效地进行 tokenization,您可能想使用其中一种工厂方法。否则,您可以传入自定义的 tok_tfm 来处理 tokenization(如果您的文本已经完成 tokenization,可以传入 noop);vocab 可以直接传入,也可以留空,由 min_freq 和 max_vocab 从文本中推断得出。
is_lm
指示我们是想将文本用于语言建模还是其他任务,seq_len
仅在 is_lm=False
时需要调整,并会传递给 pad_input_chunk
。
TextBlock.from_df
TextBlock.from_df (text_cols, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, tok=None, rules=None, sep=' ', n_workers=4, mark_fields=None, tok_text_col='text', **kwargs)
使用 text_cols
从 dataframe 构建一个 TextBlock
这里是一个使用存储为 CSV 文件的 IMDB 样本的示例。
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')

imdb_clas = DataBlock(
    blocks=(TextBlock.from_df('text', seq_len=72), CategoryBlock),
    get_x=ColReader('text'), get_y=ColReader('label'), splitter=ColSplitter())

dls = imdb_clas.dataloaders(df, bs=64)
dls.show_batch(max_n=2)
text | category | |
---|---|---|
0 | xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is | negative |
1 | xxbos xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come up with the xxunk possible scenarios to get the two protagonists together in the end . xxmaj in fact , all its charm is xxunk , contained within the characters and the setting and the plot … which is highly believable to xxunk . xxmaj it 's easy to think that such a love story , as beautiful as any other ever told , * could * happen to you … a feeling you do n't often get from other romantic comedies | positive |
vocab
、is_lm
、seq_len
、min_freq
和 max_vocab
传递给主要的 init 方法,其他参数传递给 Tokenizer.from_df
。
TextBlock.from_folder
TextBlock.from_folder (path, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, tok=None, rules=None, extensions=None, folders=None, output_dir=None, skip_if_exists=True, output_names=None, n_workers=4, encoding='utf8', **kwargs)
从 path
构建一个 TextBlock
vocab
、is_lm
、seq_len
、min_freq
和 max_vocab
传递给主要的 init 方法,其他参数传递给 Tokenizer.from_folder
。
TextDataLoaders
TextDataLoaders (*loaders, path:str|pathlib.Path='.', device=None)
一个基础包装器,用于封装多个 DataLoader
,并带有适用于 NLP 问题的工厂方法。
类型 | 默认值 | 详情 | |
---|---|---|---|
loaders | VAR_POSITIONAL | 要包装的 DataLoader 对象 | |
path | str | pathlib.Path | . | 存储导出对象的路径 |
device | NoneType | None | 放置 DataLoaders 的设备 |
您不应直接使用 init 方法,而应使用以下工厂方法之一。所有这些工厂方法都接受以下参数:
- text_vocab:用于数值化文本的词汇表(如果未传入,则从数据中推断)
- tok_tfm:如果传入,则使用此 tok_tfm 而非默认值
- seq_len:用于批次的序列长度
- bs:批量大小
- val_bs:用于验证 DataLoader 的批量大小(默认为 bs)
- shuffle_train:是否打乱训练 DataLoader
- device:要使用的 PyTorch 设备(默认为 default_device())
TextDataLoaders.from_folder
TextDataLoaders.from_folder (path, train='train', valid='valid', valid_pct=None, seed=None, vocab=None, text_vocab=None, is_lm=False, tok_tfm=None, seq_len=72, splitter=None, backwards=False, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
从 path
中具有 train
和 valid
子文件夹(或提供 valid_pct
)的 imagenet 风格数据集创建
类型 | 默认值 | 详情 | |
---|---|---|---|
path | str | pathlib.Path | . | 放入 DataLoaders 的路径 |
train | str | train | |
valid | str | valid | |
valid_pct | NoneType | None | |
seed | NoneType | None | |
vocab | NoneType | None | |
text_vocab | NoneType | None | |
is_lm | bool | False | |
tok_tfm | NoneType | None | |
seq_len | int | 72 | |
splitter | NoneType | None | |
backwards | bool | False | |
bs | int | 64 | 批量大小 |
val_bs | int | None | 验证 DataLoader 的批量大小 |
shuffle | bool | True | 是否打乱数据 |
device | NoneType | None | 放置 DataLoaders 的设备 |
如果提供了 valid_pct
,则会进行随机分割(带有可选的 seed
),将该百分比的数据留作验证集(而不是查看上级文件夹)。如果传入了 vocab
,则只保留 vocab
中名称对应的文件夹。
这里是 IMDB 电影评论数据集样本的一个示例。
path = untar_data(URLs.IMDB)
dls = TextDataLoaders.from_folder(path)
dls.show_batch(max_n=3)
text | category | |
---|---|---|
0 | xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero | pos |
1 | xxbos xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come up with the xxunk possible scenarios to get the two protagonists together in the end . xxmaj in fact , all its charm is xxunk , contained within the characters and the setting and the plot … which is highly believable to xxunk . xxmaj it 's easy to think that such a love story , as beautiful as any other ever told , * could * happen to you … a feeling you do n't often get from other romantic comedies | neg |
2 | xxbos xxup anchors xxup aweigh sees two eager young sailors , xxmaj joe xxmaj brady ( gene xxmaj kelly ) and xxmaj clarence xxmaj doolittle / xxmaj brooklyn ( frank xxmaj sinatra ) , get a special four - day shore leave . xxmaj eager to get to the girls , particularly xxmaj joe 's xxmaj lola , neither xxmaj joe nor xxmaj brooklyn figure on the interruption of little xxmaj navy - mad xxmaj donald ( dean xxmaj stockwell ) and his xxmaj aunt xxmaj susie ( kathryn xxmaj grayson ) . xxmaj unexperienced in the ways of females and courting , xxmaj brooklyn quickly enlists xxmaj joe to help him win xxmaj aunt xxmaj susie over . xxmaj along the way , however , xxmaj joe finds himself falling for the gal he thinks belongs to his best friend . xxmaj how is xxmaj brooklyn going to take | pos |
TextDataLoaders.from_df
TextDataLoaders.from_df (df, path='.', valid_pct=0.2, seed=None, text_col=0, label_col=1, label_delim=None, y_block=None, text_vocab=None, is_lm=False, valid_col=None, tok_tfm=None, tok_text_col='text', seq_len=72, backwards=False, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
从 path
中的 df
创建,带有 valid_pct
参数
类型 | 默认值 | 详情 | |
---|---|---|---|
df | |||
path | str | pathlib.Path | . | 放入 DataLoaders 的路径 |
valid_pct | float | 0.2 | |
seed | NoneType | None | |
text_col | int | 0 | |
label_col | int | 1 | |
label_delim | NoneType | None | |
y_block | NoneType | None | |
text_vocab | NoneType | None | |
is_lm | bool | False | |
valid_col | NoneType | None | |
tok_tfm | NoneType | None | |
tok_text_col | str | text | |
seq_len | int | 72 | |
backwards | bool | False | |
bs | int | 64 | 批量大小 |
val_bs | int | None | 验证 DataLoader 的批量大小 |
shuffle | bool | True | 是否打乱数据 |
device | NoneType | None | 放置 DataLoaders 的设备 |
可以选择传入 seed
以保证可重现性。text_col
、label_col
以及可选的 valid_col
是用于文本/标签和验证标志的列索引或名称。如果您的标签在一个列中,并由特定字符分隔,则可以为多标签问题传入 label_delim
。应传入 y_block
来指示您的目标类型,以防库无法正确推断。
此外,您可以使用 tok_text_col
指定 token 化文本存储到的具体列。默认情况下,token 化后它们存储在一个名为 text
的列中。
这里是一些 IMDB 子集的示例。
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/"texts.csv"); df.head()
label | text | is_valid | |
---|---|---|---|
0 | negative | Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff! | False |
1 | positive | This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som... | False |
2 | negative | Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li... | False |
3 | positive | Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie "Duty, Honor, Country" are not just mere words blathered from the lips of a high-brassed offic... | False |
4 | negative | This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr... | False |
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/"texts.csv")
dls = TextDataLoaders.from_df(df, path=path, text_col='text', label_col='label', valid_col='is_valid')
dls.show_batch(max_n=3)
text | category | |
---|---|---|
0 | xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is | negative |
1 | xxbos xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come up with the xxunk possible scenarios to get the two protagonists together in the end . xxmaj in fact , all its charm is xxunk , contained within the characters and the setting and the plot … which is highly believable to xxunk . xxmaj it 's easy to think that such a love story , as beautiful as any other ever told , * could * happen to you … a feeling you do n't often get from other romantic comedies | positive |
2 | xxbos xxmaj now that xxmaj che(2008 ) has finished its relatively short xxmaj australian cinema run ( extremely limited xxunk screen in xxmaj xxunk , after xxunk ) , i can xxunk join both xxunk of " at xxmaj the xxmaj movies " in taking xxmaj steven xxmaj soderbergh to task . \n\n xxmaj it 's usually satisfying to watch a film director change his style / subject , but xxmaj soderbergh 's most recent stinker , xxmaj the xxmaj girlfriend xxunk ) , was also missing a story , so narrative ( and editing ? ) seem to suddenly be xxmaj soderbergh 's main challenge . xxmaj strange , after 20 - odd years in the business . xxmaj he was probably never much good at narrative , just xxunk it well inside " edgy " projects . \n\n xxmaj none of this excuses him this present , | negative |
dls = TextDataLoaders.from_df(df, path=path, text_col='text', is_lm=True, valid_col='is_valid')
dls.show_batch(max_n=3)
text | text_ | |
---|---|---|
0 | xxbos xxmaj critics need to review what they class as a quality movie . i think the critics have seen too many actions films and have xxunk to the xxmaj matrix style of films . xxmaj xxunk is a breath of fresh air , a film with so many layers that one viewing is not enough to understand or appreciate this outstanding film . xxmaj xxunk von xxmaj xxunk shows that old | xxmaj critics need to review what they class as a quality movie . i think the critics have seen too many actions films and have xxunk to the xxmaj matrix style of films . xxmaj xxunk is a breath of fresh air , a film with so many layers that one viewing is not enough to understand or appreciate this outstanding film . xxmaj xxunk von xxmaj xxunk shows that old styles |
1 | xxmaj xxunk is something ) , but noticeable moments of xxunk as he still struggles to find his humanity . xxmaj this xxunk of his for a real life could get boring , and almost did in xxmaj supremacy , but just works better in xxmaj ultimatum ( better script ) . \n\n i am reminded of a scene in " xxunk " ( the only good xxmaj pierce xxmaj xxunk xxmaj | xxunk is something ) , but noticeable moments of xxunk as he still struggles to find his humanity . xxmaj this xxunk of his for a real life could get boring , and almost did in xxmaj supremacy , but just works better in xxmaj ultimatum ( better script ) . \n\n i am reminded of a scene in " xxunk " ( the only good xxmaj pierce xxmaj xxunk xxmaj bond |
2 | xxmaj mr . xxmaj julia , played his role equally as perfect . xxmaj it was interesting to see how reluctant xxmaj richard xxmaj dreyfuss was in replacing the dictator against his will . xxmaj but he became more confident and comfortable with the role as time passed . xxmaj since everything happens for a reason in life , i believe he was forced to replace the dictator because he was meant | mr . xxmaj julia , played his role equally as perfect . xxmaj it was interesting to see how reluctant xxmaj richard xxmaj dreyfuss was in replacing the dictator against his will . xxmaj but he became more confident and comfortable with the role as time passed . xxmaj since everything happens for a reason in life , i believe he was forced to replace the dictator because he was meant to |
TextDataLoaders.from_csv
TextDataLoaders.from_csv (path, csv_fname='labels.csv', header='infer', delimiter=None, quoting=0, valid_pct=0.2, seed=None, text_col=0, label_col=1, label_delim=None, y_block=None, text_vocab=None, is_lm=False, valid_col=None, tok_tfm=None, tok_text_col='text', seq_len=72, backwards=False, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)
从 path/csv_fname
中的 csv
文件创建
类型 | 默认值 | 详情 | |
---|---|---|---|
path | str | pathlib.Path | . | 放入 DataLoaders 的路径 |
csv_fname | str | labels.csv | |
header | str | infer | |
delimiter | NoneType | None | |
quoting | int | 0 | |
valid_pct | float | 0.2 | |
seed | NoneType | None | |
text_col | int | 0 | |
label_col | int | 1 | |
label_delim | NoneType | None | |
y_block | NoneType | None | |
text_vocab | NoneType | None | |
is_lm | bool | False | |
valid_col | NoneType | None | |
tok_tfm | NoneType | None | |
tok_text_col | str | text | |
seq_len | int | 72 | |
backwards | bool | False | |
bs | int | 64 | 批量大小 |
val_bs | int | None | 验证 DataLoader 的批量大小 |
shuffle | bool | True | 是否打乱数据 |
device | NoneType | None | 放置 DataLoaders 的设备 |
使用 header
和 delimiter
打开 csv 文件,然后将所有其他参数传递给 TextDataLoaders.from_df
。
dls = TextDataLoaders.from_csv(path=path, csv_fname='texts.csv', text_col='text', label_col='label', valid_col='is_valid')
dls.show_batch(max_n=3)
text | category | |
---|---|---|
0 | xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is | negative |
1 | xxbos xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come up with the xxunk possible scenarios to get the two protagonists together in the end . xxmaj in fact , all its charm is xxunk , contained within the characters and the setting and the plot … which is highly believable to xxunk . xxmaj it 's easy to think that such a love story , as beautiful as any other ever told , * could * happen to you … a feeling you do n't often get from other romantic comedies | positive |
2 | xxbos xxmaj now that xxmaj che(2008 ) has finished its relatively short xxmaj australian cinema run ( extremely limited xxunk screen in xxmaj xxunk , after xxunk ) , i can xxunk join both xxunk of " at xxmaj the xxmaj movies " in taking xxmaj steven xxmaj soderbergh to task . \n\n xxmaj it 's usually satisfying to watch a film director change his style / subject , but xxmaj soderbergh 's most recent stinker , xxmaj the xxmaj girlfriend xxunk ) , was also missing a story , so narrative ( and editing ? ) seem to suddenly be xxmaj soderbergh 's main challenge . xxmaj strange , after 20 - odd years in the business . xxmaj he was probably never much good at narrative , just xxunk it well inside " edgy " projects . \n\n xxmaj none of this excuses him this present , | negative |