ValueError: Invalid pattern: '**' can only be an entire path component

I am trying to fine-tune an LLM.

My code:

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# load dataset
dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style")
dataset

Error:

    ---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [12], line 2
      1 # load dataset
----> 2 dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style")
      3 dataset

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1661 ignore_verifications = ignore_verifications or save_infos
   1663 # Create a dataset builder
-> 1664 builder_instance = load_dataset_builder(
   1665     path=path,
   1666     name=name,
   1667     data_dir=data_dir,
   1668     data_files=data_files,
   1669     cache_dir=cache_dir,
   1670     features=features,
   1671     download_config=download_config,
   1672     download_mode=download_mode,
   1673     revision=revision,
   1674     use_auth_token=use_auth_token,
   1675     **config_kwargs,
   1676 )
   1678 # Return iterable dataset in case of streaming
   1679 if streaming:

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
   1488     download_config = download_config.copy() if download_config else DownloadConfig()
   1489     download_config.use_auth_token = use_auth_token
-> 1490 dataset_module = dataset_module_factory(
   1491     path,
   1492     revision=revision,
   1493     download_config=download_config,
   1494     download_mode=download_mode,
   1495     data_dir=data_dir,
   1496     data_files=data_files,
   1497 )
   1499 # Get dataset builder class from the processing script
   1500 builder_cls = import_main_class(dataset_module.module_path)

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1237             if isinstance(e1, FileNotFoundError):
   1238                 raise FileNotFoundError(
   1239                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1240                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1241                 ) from None
-> 1242             raise e1 from None
   1243 else:
   1244     raise FileNotFoundError(
   1245         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1246     )

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1223, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1215             return HubDatasetModuleFactoryWithScript(
   1216                 path,
   1217                 revision=revision,
   (...)
   1220                 dynamic_modules_path=dynamic_modules_path,
   1221             ).get_module()
   1222         else:
-> 1223             return HubDatasetModuleFactoryWithoutScript(
   1224                 path,
   1225                 revision=revision,
   1226                 data_dir=data_dir,
   1227                 data_files=data_files,
   1228                 download_config=download_config,
   1229                 download_mode=download_mode,
   1230             ).get_module()
   1231 except Exception as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
   1232     try:

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    836     token = self.download_config.use_auth_token
    837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
    838     self.name,
    839     revision=self.revision,
    840     token=token,
    841     timeout=100.0,
    842 )
    843 patterns = (
    844     sanitize_patterns(self.data_files)
    845     if self.data_files is not None
--> 846     else get_patterns_in_dataset_repository(hfh_dataset_info)
    847 )
    848 data_files = DataFilesDict.from_hf_repo(
    849     patterns,
    850     dataset_info=hfh_dataset_info,
    851     allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    852 )
    853 infered_module_names = {
    854     key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    855     for key, data_files_list in data_files.items()
    856 }

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
    469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
    470 try:
--> 471     return _get_data_files_patterns(resolver)
    472 except FileNotFoundError:
    473     raise FileNotFoundError(
    474         f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
    475     ) from None

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
     97 try:
     98     for pattern in patterns:
---> 99         data_files = pattern_resolver(pattern)
    100         if len(data_files) > 0:
    101             non_empty_splits.append(split)

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
    301 data_files_ignore = FILES_TO_IGNORE
    302 fs = HfFileSystem(repo_info=dataset_info)
--> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    304 matched_paths = [
    305     filepath
    306     for filepath in glob_iter
    307     if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
    308 ]
    309 if allowed_extensions is not None:

File /usr/local/lib/python3.9/dist-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602         depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610     p: info
    611     for p, info in sorted(allpaths.items())
   (...)
    618     )
    619 }

File /usr/local/lib/python3.9/dist-packages/fsspec/utils.py:734, in glob_translate(pat)
    732     continue
    733 elif "**" in part:
--> 734     raise ValueError(
    735         "Invalid pattern: '**' can only be an entire path component"
    736     )
    737 if part:
    738     results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component

I tried to find something about this online and found a post, but I could not understand the solution given there. Can anyone help me understand how to fix the error I am facing? Thanks.

My library versions:

  • peft : 0.6.0
  • torch : 2.1.2+cu121
  • datasets : 2.1.0
  • transformers : 4.21.3
Answers

This error most likely occurs because your datasets package (any version between roughly 2.1 and 2.14) breaks with recent fsspec releases. It has been fixed in the latest datasets release (2.15.0); see this issue: https://github.com/huggingface/datasets/issues/6352#issuecomment-1781073234

Update your installation with pip install -U datasets so that it works with the installed fsspec.

This works when coming from datasets 2.10.1 on Python 3.10, since the upgrade brings in the hotfix that was added in datasets 2.15.0 and later.
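
As a quick sanity check (my own addition, not part of the answer above), you can print the versions that the running interpreter actually imports after the upgrade:

import datasets
import fsspec

# The fix is reported to ship with datasets 2.15.0 and later, so the first
# version printed here should be >= 2.15.0 once the upgrade has taken effect.
print("datasets:", datasets.__version__)
print("fsspec:", fsspec.__version__)

If upgrading datasets is not an option, pinning fsspec to an older release (for example fsspec==2023.9.2) is sometimes suggested as a workaround for this error, though I have not verified that for this particular dataset.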

The solution above does not work for me, and I would like to know whether the problem is with my dataset or with my code for loading it (I can load some other datasets just fine).

from datasets import load_dataset
print("started")
gsm8k = load_dataset("gsm8k", "main")  # this works fine
gsm8k = load_dataset("jijivski/mock_gsm8k")  # , download_mode="force_redownload")
# I get "ValueError: Invalid pattern: '**' can only be an entire path component" here
print("loaded")
gsm8k

I successfully installed datasets-2.15.0, fsspec-2023.10.0, and pyarrow-hotfix-0.6, and also tried the latest datasets 2.16.1, but I hit the same problem in online environments such as Kaggle.

It would be strange if the problem were with my dataset itself, but if it is, I would appreciate it if someone could point out what is wrong.

Error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[7], line 5
      2 get_ipython().system('pip install -U datasets')
      3 from datasets import load_dataset
----> 5 dataset = load_dataset("jijivski/mock_gsm8k")

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1661 ignore_verifications = ignore_verifications or save_infos
   1663 # Create a dataset builder
-> 1664 builder_instance = load_dataset_builder(
   1665     path=path,
   1666     name=name,
   1667     data_dir=data_dir,
   1668     data_files=data_files,
   1669     cache_dir=cache_dir,
   1670     features=features,
   1671     download_config=download_config,
   1672     download_mode=download_mode,
   1673     revision=revision,
   1674     use_auth_token=use_auth_token,
   1675     **config_kwargs,
   1676 )
   1678 # Return iterable dataset in case of streaming
   1679 if streaming:

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
   1488     download_config = download_config.copy() if download_config else DownloadConfig()
   1489     download_config.use_auth_token = use_auth_token
-> 1490 dataset_module = dataset_module_factory(
   1491     path,
   1492     revision=revision,
   1493     download_config=download_config,
   1494     download_mode=download_mode,
   1495     data_dir=data_dir,
   1496     data_files=data_files,
   1497 )
   1499 # Get dataset builder class from the processing script
   1500 builder_cls = import_main_class(dataset_module.module_path)

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1237             if isinstance(e1, FileNotFoundError):
   1238                 raise FileNotFoundError(
   1239                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1240                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1241                 ) from None
-> 1242             raise e1 from None
   1243 else:
   1244     raise FileNotFoundError(
   1245         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1246     )

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1230, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1215             return HubDatasetModuleFactoryWithScript(
   1216                 path,
   1217                 revision=revision,
   (...)
   1220                 dynamic_modules_path=dynamic_modules_path,
   1221             ).get_module()
   1222         else:
   1223             return HubDatasetModuleFactoryWithoutScript(
   1224                 path,
   1225                 revision=revision,
   1226                 data_dir=data_dir,
   1227                 data_files=data_files,
   1228                 download_config=download_config,
   1229                 download_mode=download_mode,
-> 1230             ).get_module()
   1231 except Exception as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
   1232     try:

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    836     token = self.download_config.use_auth_token
    837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
    838     self.name,
    839     revision=self.revision,
    840     token=token,
    841     timeout=100.0,
    842 )
    843 patterns = (
    844     sanitize_patterns(self.data_files)
    845     if self.data_files is not None
--> 846     else get_patterns_in_dataset_repository(hfh_dataset_info)
    847 )
    848 data_files = DataFilesDict.from_hf_repo(
    849     patterns,
    850     dataset_info=hfh_dataset_info,
    851     allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    852 )
    853 infered_module_names = {
    854     key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    855     for key, data_files_list in data_files.items()
    856 }

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
    469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
    470 try:
--> 471     return _get_data_files_patterns(resolver)
    472 except FileNotFoundError:
    473     raise FileNotFoundError(
    474         f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
    475     ) from None

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
     97 try:
     98     for pattern in patterns:
---> 99         data_files = pattern_resolver(pattern)
    100         if len(data_files) > 0:
    101             non_empty_splits.append(split)

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
    301 data_files_ignore = FILES_TO_IGNORE
    302 fs = HfFileSystem(repo_info=dataset_info)
--> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    304 matched_paths = [
    305     filepath
    306     for filepath in glob_iter
    307     if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
    308 ]
    309 if allowed_extensions is not None:

File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602         depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610     p: info
    611     for p, info in sorted(allpaths.items())
   (...)
    618     )
    619 }

File /opt/conda/lib/python3.10/site-packages/fsspec/utils.py:734, in glob_translate(pat)
    732     continue
    733 elif "**" in part:
--> 734     raise ValueError(
    735         "Invalid pattern: '**' can only be an entire path component"
    736     )
    737 if part:
    738     results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component
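
One more thing worth ruling out here (my guess, not something this traceback proves): in hosted notebooks such as Kaggle, running pip install -U datasets in a cell does not affect modules that were already imported, so the kernel may still be using the pre-installed datasets/fsspec. A small check:

import datasets
import fsspec

# If these versions are older than the ones pip reported installing, or the
# paths point at the pre-installed site-packages, restart the kernel and rerun.
print(datasets.__version__, datasets.__file__)
print(fsspec.__version__, fsspec.__file__)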



