I’m trying to tokenize a document containing both Chinese and English text. The plan is to start from a pre-trained model and pair it with a custom-word dictionary so that specific entities are tokenized correctly for NER training (I’ve sketched the dictionary part below the load call).
Below is the code:
import hanlp

HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)  # Multi-Language
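For the dictionary part, this is roughly what I was planning to do once the model loads. It is adapted from the custom-dict demo, and I’m not certain the multilingual model exposes the same task key or dict_force attribute, so please treat it as a sketch (the entity names are placeholders):

# assumption: the multilingual model's tokenization task is keyed as 'tok' and
# supports dict_force the same way the Chinese model's 'tok/fine' task does
HanLP['tok'].dict_force = {'CustomEntityName', '自定义实体'}
doc = HanLP('Mixed 中英文 sentence mentioning CustomEntityName.', tasks='tok')
print(doc['tok'])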
Below is the error output:
Failed to load https://file.hankcs.com/hanlp/mtl/ud_ontonotes_tok_pos_lem_fea_ner_srl_dep_sdp_con_xlm_base_20220608_003435.zip
Please upgrade HanLP with:
pip install --upgrade hanlp
If the problem still persists, please submit an issue to https://github.com/hankcs/HanLP/issues
When reporting an issue, make sure to paste the FULL ERROR LOG below.
================================ERROR LOG BEGINS================================
OS: macOS-10.16-x86_64-i386-64bit
Python: 3.8.8
PyTorch: 1.10.2
HanLP: 2.1.0-beta.45
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-179-2be7a6b9aada> in <module>
1 # HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH) <- Think only handle Chinese
----> 2 HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE) # Multi-Language
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/__init__.py in load(save_dir, verbose, **kwargs)
41 from hanlp_common.constant import HANLP_VERBOSE
42 verbose = HANLP_VERBOSE
---> 43 return load_from_meta_file(save_dir, 'meta.json', verbose=verbose, **kwargs)
44
45
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/utils/component_util.py in load_from_meta_file(save_dir, meta_filename, transform_only, verbose, **kwargs)
180 if isinstance(e.args[i], str):
181 from hanlp_common.util import set_tuple_with
--> 182 e.args = set_tuple_with(e.args, e.args[i] + f'\n{"ERROR LOG ENDS":=^80}', i)
183 break
184 except:
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/utils/component_util.py in load_from_meta_file(save_dir, meta_filename, transform_only, verbose, **kwargs)
104 else:
105 if os.path.isfile(os.path.join(save_dir, 'config.json')):
--> 106 obj.load(save_dir, verbose=verbose, **kwargs)
107 else:
108 obj.load(metapath, **kwargs)
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/common/torch_component.py in load(self, save_dir, devices, verbose, **kwargs)
171 if devices is None and self.model:
172 devices = self.devices
--> 173 self.load_config(save_dir, **kwargs)
174 self.load_vocabs(save_dir)
175 if verbose:
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/common/torch_component.py in load_config(self, save_dir, filename, **kwargs)
123 for k, v in self.config.items():
124 if isinstance(v, dict) and 'classpath' in v:
--> 125 self.config[k] = Configurable.from_config(v)
126 self.on_config_ready(**self.config, save_dir=save_dir)
127
~/opt/anaconda3/lib/python3.8/site-packages/hanlp_common/configurable.py in from_config(config, **kwargs)
28 if cls.from_config == Configurable.from_config:
29 deserialized_config.pop('classpath')
---> 30 return cls(**deserialized_config)
31 else:
32 return cls.from_config(deserialized_config)
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/layers/embeddings/contextual_word_embedding.py in __init__(self, field, transformer, average_subwords, scalar_mix, word_dropout, max_sequence_length, truncate_long_sequences, cls_is_bos, sep_is_eos, ret_token_span, ret_subtokens, ret_subtokens_group, ret_prefix_mask, ret_raw_hidden_states, transformer_args, use_fast, do_basic_tokenize, trainable)
139 self.transformer = transformer
140 self.field = field
--> 141 self._transformer_tokenizer = AutoTokenizer_.from_pretrained(self.transformer,
142 use_fast=use_fast,
143 do_basic_tokenize=do_basic_tokenize)
~/opt/anaconda3/lib/python3.8/site-packages/hanlp/layers/transformers/pt_imports.py in from_pretrained(cls, pretrained_model_name_or_path, use_fast, do_basic_tokenize)
66 if use_fast and not do_basic_tokenize:
67 warnings.warn('`do_basic_tokenize=False` might not work when `use_fast=True`')
---> 68 tokenizer = cls.from_pretrained(get_tokenizer_mirror(transformer), use_fast=use_fast,
69 do_basic_tokenize=do_basic_tokenize,
70 **additional_config)
~/opt/anaconda3/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
528 facebook/rag-token-base), specify it here.
529 use_fast (`bool`, *optional*, defaults to `True`):
--> 530 Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
531 a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
532 is returned instead.
~/opt/anaconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py in __getitem__(self, key)
565 else:
566 raise ValueError(f"Could not find {attr} in {transformers_module}!")
--> 567
568
569 class _LazyAutoMapping(OrderedDict):
~/opt/anaconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py in _load_attr_from_module(self, model_type, attr)
571 " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed.
572
--> 573 Args:
574 - config_mapping: The map model type to config class
575 - model_mapping: The map model type to model (or tokenizer) class
~/opt/anaconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py in getattribute_from_module(module, attr)
532 from_pretrained.__doc__ = from_pretrained_docstring
533 from_pretrained = replace_list_option_in_docstrings(model_mapping._model_mapping)(from_pretrained)
--> 534 cls.from_pretrained = classmethod(from_pretrained)
535 return cls
536
~/opt/anaconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py in <genexpr>(.0)
532 from_pretrained.__doc__ = from_pretrained_docstring
533 from_pretrained = replace_list_option_in_docstrings(model_mapping._model_mapping)(from_pretrained)
--> 534 cls.from_pretrained = classmethod(from_pretrained)
535 return cls
536
~/opt/anaconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py in getattribute_from_module(module, attr)
533 from_pretrained = replace_list_option_in_docstrings(model_mapping._model_mapping)(from_pretrained)
534 cls.from_pretrained = classmethod(from_pretrained)
--> 535 return cls
536
537
~/opt/anaconda3/lib/python3.8/site-packages/transformers/file_utils.py in __getattr__(self, name)
~/opt/anaconda3/lib/python3.8/site-packages/transformers/file_utils.py in _get_module(self, module_name)
RuntimeError: Failed to import transformers.models.xlm_roberta.tokenization_xlm_roberta_fast because of the following error (look up to see its traceback):
cannot import name 'is_sentencepiece_available' from 'transformers.utils' (/Users/nchan94/opt/anaconda3/lib/python3.8/site-packages/transformers/utils/__init__.py)
=================================ERROR LOG ENDS=================================
I have already tried pip install --upgrade hanlp and ran the code again, but it still fails with the same error. I also tried other multilingual pre-trained models listed on your site, such as:
hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L6
hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_MMINILMV2L12
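Since the last frame points at transformers rather than HanLP, I also ran this quick check in a fresh kernel to confirm the failing import on its own (the import path is copied straight from the error message):

import hanlp, transformers
print(hanlp.__version__, transformers.__version__)
# this is the import that the traceback says cannot be found
from transformers.utils import is_sentencepiece_available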
Let me know what I can do. Thanks!