Thank you for your response. I was able to get the function working now.
Now I’m preparing the train/test files using your pre-trained tokenizer model.
I want word-level processing (e.g. a token like ‘鄧慧穎’ with the label ‘S-Person’), and the model I am fine-tuning, ‘hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH’, appears to use similar tags according to its train.log.
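For reference, I expect each line of the TSV to be a token and its tag separated by a tab, with a blank line between sentences, e.g. (illustrative rows only):

鄧慧穎	S-Person
of	O
an	O
instrument	O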
But I am still getting an error saying it cannot load the training file. Sample output below:
{
"adam_epsilon": 1e-08,
"average_subwords": false,
"batch_max_tokens": null,
"batch_size": 32,
"char_level": false,
"classpath": "hanlp.components.ner.transformer_ner.TransformerNamedEntityRecognizer",
"crf": false,
"delimiter_in_entity": null,
"epochs": 100,
"extra_embeddings": null,
"finetune": "https://file.hankcs.com/hanlp/ner/msra_ner_electra_small_20220215_205503.zip",
"grad_norm": 5.0,
"gradient_accumulation": 1,
"hanlp_version": "2.1.0-beta.45",
"hard_constraint": false,
"hidden_dropout": null,
"layer_dropout": 0,
"lr": 5e-05,
"max_seq_len": null,
"merge_types": null,
"mix_embedding": 0,
"patience": 5,
"reduction": "sum",
"ret_raw_hidden_states": false,
"sampler_builder": null,
"scalar_mix": null,
"secondary_encoder": null,
"seed": 1678702763,
"sent_delimiter": null,
"tagset": null,
"token_key": null,
"transform": null,
"transformer": "albert_base_zh",
"transformer_grad_norm": null,
"transformer_layers": null,
"transformer_lr": null,
"warmup_steps": 0.1,
"weight_decay": 0,
"word_dropout": 0.2
}
Finetune model loaded with 12686113/12686113 trainable/total parameters.
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/utils/io_util.py in generate_words_tags_from_tsv(tsv_file_path, lower, gold, max_seq_length, sent_delimiter, char_level, hard_constraint)
461 try:
--> 462 tags = [cells[1] for cells in sent]
463 except:
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/utils/io_util.py in <listcomp>(.0)
461 try:
--> 462 tags = [cells[1] for cells in sent]
463 except:
IndexError: list index out of range
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-333-03009f045040> in <module>
8 test_dataset="NER_Test.tsv"
9
---> 10 recognizer.fit(train_dataset,
11 test_dataset,
12 save_dir,
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/components/ner/transformer_ner.py in fit(self, trn_data, dev_data, save_dir, transformer, delimiter_in_entity, merge_types, average_subwords, word_dropout, hidden_dropout, layer_dropout, scalar_mix, grad_norm, lr, transformer_lr, adam_epsilon, weight_decay, warmup_steps, crf, secondary_encoder, reduction, batch_size, sampler_builder, epochs, tagset, token_key, max_seq_len, sent_delimiter, char_level, hard_constraint, transform, logger, seed, devices, **kwargs)
200 The best metrics on training set.
201 """
--> 202 return super().fit(**merge_locals_kwargs(locals(), kwargs))
203
204 def build_vocabs(self, trn, logger, **kwargs):
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/components/taggers/transformers/transformer_tagger.py in fit(self, trn_data, dev_data, save_dir, transformer, average_subwords, word_dropout, hidden_dropout, layer_dropout, scalar_mix, mix_embedding, grad_norm, transformer_grad_norm, lr, transformer_lr, transformer_layers, gradient_accumulation, adam_epsilon, weight_decay, warmup_steps, secondary_encoder, extra_embeddings, crf, reduction, batch_size, sampler_builder, epochs, patience, token_key, max_seq_len, sent_delimiter, char_level, hard_constraint, transform, logger, devices, **kwargs)
250 devices: Union[float, int, List[int]] = None,
251 **kwargs):
--> 252 return super().fit(**merge_locals_kwargs(locals(), kwargs))
253
254 def feed_batch(self, batch: dict):
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/components/classifiers/transformer_classifier.py in fit(self, trn_data, dev_data, save_dir, transformer, lr, transformer_lr, adam_epsilon, weight_decay, warmup_steps, batch_size, gradient_accumulation, grad_norm, transformer_grad_norm, average_subwords, scalar_mix, word_dropout, hidden_dropout, max_seq_len, ret_raw_hidden_states, batch_max_tokens, epochs, logger, devices, **kwargs)
106 devices: Union[float, int, List[int]] = None,
107 **kwargs):
--> 108 return super().fit(**merge_locals_kwargs(locals(), kwargs))
109
110 def on_config_ready(self, **kwargs):
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/common/torch_component.py in fit(self, trn_data, dev_data, save_dir, batch_size, epochs, devices, logger, seed, finetune, eval_trn, _device_placeholder, **kwargs)
253 f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
254 self.on_config_ready(**self.config, save_dir=save_dir)
--> 255 trn = self.build_dataloader(**merge_dict(config, data=trn_data, batch_size=batch_size, shuffle=True,
256 training=True, device=first_device, logger=logger, vocabs=self.vocabs,
257 overwrite=True))
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/components/taggers/transformers/transformer_tagger.py in build_dataloader(self, data, batch_size, shuffle, device, logger, sampler_builder, gradient_accumulation, extra_embeddings, transform, max_seq_len, **kwargs)
162 args = dict((k, self.config.get(k, None)) for k in
163 ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'])
--> 164 dataset = self.build_dataset(data, **args)
165 if self.config.token_key is None:
166 self.config.token_key = next(iter(dataset[0]))
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/components/ner/transformer_ner.py in build_dataset(self, data, transform, **kwargs)
213
214 def build_dataset(self, data, transform=None, **kwargs):
--> 215 dataset = super().build_dataset(data, transform, **kwargs)
216 if isinstance(data, str):
217 tagset = self.config.get('tagset', None)
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/components/taggers/transformers/transformer_tagger.py in build_dataset(self, data, transform, **kwargs)
189
190 def build_dataset(self, data, transform=None, **kwargs):
--> 191 return TSVTaggingDataset(data, transform=transform, **kwargs)
192
193 def last_transform(self):
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/datasets/ner/loaders/tsv.py in __init__(self, data, transform, cache, generate_idx, max_seq_len, sent_delimiter, char_level, hard_constraint, **kwargs)
43 self.sent_delimiter = sent_delimiter
44 self.max_seq_len = max_seq_len
---> 45 super().__init__(data, transform, cache, generate_idx)
46
47 def load_file(self, filepath):
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/common/dataset.py in __init__(self, data, transform, cache, generate_idx)
126 if generate_idx is None:
127 generate_idx = isinstance(data, list)
--> 128 data_ = self.load_data(data, generate_idx)
129 # assert data_, f'No samples loaded from {data}'
130 if data_:
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/common/dataset.py in load_data(self, data, generate_idx)
152 if isinstance(data, str):
153 data = get_resource(data)
--> 154 data = list(self.load_file(data))
155 if generate_idx:
156 for i, each in enumerate(data):
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/datasets/ner/loaders/tsv.py in load_file(self, filepath)
70 filepath = get_resource(filepath)
71 # idx = 0
---> 72 for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
73 # idx += 1
74 # if idx % 1000 == 0:
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/utils/io_util.py in generate_words_tags_from_tsv(tsv_file_path, lower, gold, max_seq_length, sent_delimiter, char_level, hard_constraint)
462 tags = [cells[1] for cells in sent]
463 except:
--> 464 raise ValueError(f'Failed to load {tsv_file_path}: {sent}')
465 else:
466 tags = None
ValueError: Failed to load NER_Train.tsv: [['MEMORIAL', 'O'], ['of', 'O'], ['an', 'O'], ['instrument', 'O'], ['to', 'O'], ['be', 'O'], ['registered', 'O'], ['in', 'O'], ['the', 'O'], ['Land', 'O'],
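From the traceback, generate_words_tags_from_tsv expects every non-blank line to split into at least two columns (token and tag). To narrow it down I put together a quick check along these lines (a rough sketch, assuming tab-separated columns and blank lines between sentences):

# Flag any non-blank line that lacks a tag column.
# Assumes one token per line, tab-separated, with blank lines between sentences.
with open('NER_Train.tsv', encoding='utf-8') as f:
    for lineno, line in enumerate(f, 1):
        stripped = line.rstrip('\n')
        if not stripped.strip():
            continue  # sentence boundary
        cells = stripped.split('\t')
        if len(cells) < 2 or not cells[1].strip():
            print(f'Line {lineno}: missing tag column -> {stripped!r}')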
The code I used is below (setup slightly simplified; the save path is a placeholder):

import hanlp
from hanlp.components.ner.transformer_ner import TransformerNamedEntityRecognizer

recognizer = TransformerNamedEntityRecognizer()
save_dir = 'msra_ner_finetuned'  # placeholder path
train_dataset = 'NER_Train.tsv'
test_dataset = 'NER_Test.tsv'
recognizer.fit(train_dataset, test_dataset, save_dir,
               epochs=100,
               transformer='albert_base_zh',
               finetune=hanlp.pretrained.ner.MSRA_NER_ELECTRA_SMALL_ZH)
Does it have to do with how I created the tsv file? Is there a recommended way?
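For example, would writing the file like this produce the expected format (a minimal sketch; token_tag_sentences is a placeholder for my list of sentences, each a list of (token, tag) pairs)?

# Write one token per line as token<TAB>tag, with a blank line between sentences.
token_tag_sentences = [
    [('鄧慧穎', 'S-Person'), ('of', 'O')],  # toy data only
]
with open('NER_Train.tsv', 'w', encoding='utf-8') as f:
    for sentence in token_tag_sentences:
        for token, tag in sentence:
            f.write(f'{token}\t{tag}\n')
        f.write('\n')  # sentence boundary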