ValueError: Invalid pattern: '**' can only be an entire path component

I am trying to fine-tune an LLM.

My code:

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# load dataset
dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style")
dataset

Error:

    ---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [12], line 2
      1 # load dataset
----> 2 dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style")
      3 dataset

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1661 ignore_verifications = ignore_verifications or save_infos
   1663 # Create a dataset builder
-> 1664 builder_instance = load_dataset_builder(
   1665     path=path,
   1666     name=name,
   1667     data_dir=data_dir,
   1668     data_files=data_files,
   1669     cache_dir=cache_dir,
   1670     features=features,
   1671     download_config=download_config,
   1672     download_mode=download_mode,
   1673     revision=revision,
   1674     use_auth_token=use_auth_token,
   1675     **config_kwargs,
   1676 )
   1678 # Return iterable dataset in case of streaming
   1679 if streaming:

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
   1488     download_config = download_config.copy() if download_config else DownloadConfig()
   1489     download_config.use_auth_token = use_auth_token
-> 1490 dataset_module = dataset_module_factory(
   1491     path,
   1492     revision=revision,
   1493     download_config=download_config,
   1494     download_mode=download_mode,
   1495     data_dir=data_dir,
   1496     data_files=data_files,
   1497 )
   1499 # Get dataset builder class from the processing script
   1500 builder_cls = import_main_class(dataset_module.module_path)

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1237             if isinstance(e1, FileNotFoundError):
   1238                 raise FileNotFoundError(
   1239                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1240                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1241                 ) from None
-> 1242             raise e1 from None
   1243 else:
   1244     raise FileNotFoundError(
   1245         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1246     )

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1223, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1215             return HubDatasetModuleFactoryWithScript(
   1216                 path,
   1217                 revision=revision,
   (...)
   1220                 dynamic_modules_path=dynamic_modules_path,
   1221             ).get_module()
   1222         else:
-> 1223             return HubDatasetModuleFactoryWithoutScript(
   1224                 path,
   1225                 revision=revision,
   1226                 data_dir=data_dir,
   1227                 data_files=data_files,
   1228                 download_config=download_config,
   1229                 download_mode=download_mode,
   1230             ).get_module()
   1231 except Exception as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
   1232     try:

File /usr/local/lib/python3.9/dist-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    836     token = self.download_config.use_auth_token
    837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
    838     self.name,
    839     revision=self.revision,
    840     token=token,
    841     timeout=100.0,
    842 )
    843 patterns = (
    844     sanitize_patterns(self.data_files)
    845     if self.data_files is not None
--> 846     else get_patterns_in_dataset_repository(hfh_dataset_info)
    847 )
    848 data_files = DataFilesDict.from_hf_repo(
    849     patterns,
    850     dataset_info=hfh_dataset_info,
    851     allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    852 )
    853 infered_module_names = {
    854     key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    855     for key, data_files_list in data_files.items()
    856 }

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
    469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
    470 try:
--> 471     return _get_data_files_patterns(resolver)
    472 except FileNotFoundError:
    473     raise FileNotFoundError(
    474         f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
    475     ) from None

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
     97 try:
     98     for pattern in patterns:
---> 99         data_files = pattern_resolver(pattern)
    100         if len(data_files) > 0:
    101             non_empty_splits.append(split)

File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
    301 data_files_ignore = FILES_TO_IGNORE
    302 fs = HfFileSystem(repo_info=dataset_info)
--> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    304 matched_paths = [
    305     filepath
    306     for filepath in glob_iter
    307     if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
    308 ]
    309 if allowed_extensions is not None:

File /usr/local/lib/python3.9/dist-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602         depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610     p: info
    611     for p, info in sorted(allpaths.items())
   (...)
    618     )
    619 }

File /usr/local/lib/python3.9/dist-packages/fsspec/utils.py:734, in glob_translate(pat)
    732     continue
    733 elif "**" in part:
--> 734     raise ValueError(
    735         "Invalid pattern: '**' can only be an entire path component"
    736     )
    737 if part:
    738     results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component

I tried to find something about this online and found a post, but I could not understand the solution given there. Can anyone help me understand how to fix the error I am facing? Thanks.

My library versions:

  • peft : 0.6.0
  • torch : 2.1.2+cu121
  • datasets : 2.1.0
  • transformers : 4.21.3
Answers

This error most likely occurs because your datasets package (any version between roughly 2.1 and 2.14) breaks with recent fsspec releases. It has been fixed in the latest datasets release (2.15.0); see this issue: https://github.com/huggingface/datasets/issues/6352#issuecomment-1781073234

Update your installation with pip install -U datasets so that it works with the installed fsspec.

This works when coming from datasets 2.10.1 on Python 3.10, since the upgrade brings in the hotfix that was added in datasets 2.15.0 and later.
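
As a quick sanity check (my own addition, not part of the answer above), you can print the versions that the running interpreter actually imports after the upgrade:

import datasets
import fsspec

# The fix is reported to ship with datasets 2.15.0 and later, so the first
# version printed here should be >= 2.15.0 once the upgrade has taken effect.
print("datasets:", datasets.__version__)
print("fsspec:", fsspec.__version__)

If upgrading datasets is not an option, pinning fsspec to an older release (for example fsspec==2023.9.2) is sometimes suggested as a workaround for this error, though I have not verified that for this particular dataset.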

The solution above does not work for me, and I would like to know whether the problem is with my dataset or with my code for loading it (I can load some other datasets just fine).

from datasets import load_dataset
print("started")
gsm8k = load_dataset("gsm8k", "main")  # this works fine
gsm8k = load_dataset("jijivski/mock_gsm8k")  # , download_mode="force_redownload")
# I get "ValueError: Invalid pattern: '**' can only be an entire path component" here
print("loaded")
gsm8k

I successfully installed datasets-2.15.0, fsspec-2023.10.0, and pyarrow-hotfix-0.6, and also tried the latest datasets 2.16.1, but I hit the same problem in online environments such as Kaggle.

It would be strange if the problem were with my dataset itself, but if it is, I would appreciate it if someone could point out what is wrong.

Error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[7], line 5
      2 get_ipython().system('pip install -U datasets')
      3 from datasets import load_dataset
----> 5 dataset = load_dataset("jijivski/mock_gsm8k")

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
   1661 ignore_verifications = ignore_verifications or save_infos
   1663 # Create a dataset builder
-> 1664 builder_instance = load_dataset_builder(
   1665     path=path,
   1666     name=name,
   1667     data_dir=data_dir,
   1668     data_files=data_files,
   1669     cache_dir=cache_dir,
   1670     features=features,
   1671     download_config=download_config,
   1672     download_mode=download_mode,
   1673     revision=revision,
   1674     use_auth_token=use_auth_token,
   1675     **config_kwargs,
   1676 )
   1678 # Return iterable dataset in case of streaming
   1679 if streaming:

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
   1488     download_config = download_config.copy() if download_config else DownloadConfig()
   1489     download_config.use_auth_token = use_auth_token
-> 1490 dataset_module = dataset_module_factory(
   1491     path,
   1492     revision=revision,
   1493     download_config=download_config,
   1494     download_mode=download_mode,
   1495     data_dir=data_dir,
   1496     data_files=data_files,
   1497 )
   1499 # Get dataset builder class from the processing script
   1500 builder_cls = import_main_class(dataset_module.module_path)

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1237             if isinstance(e1, FileNotFoundError):
   1238                 raise FileNotFoundError(
   1239                     f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
   1240                     f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
   1241                 ) from None
-> 1242             raise e1 from None
   1243 else:
   1244     raise FileNotFoundError(
   1245         f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
   1246     )

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:1230, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
   1215             return HubDatasetModuleFactoryWithScript(
   1216                 path,
   1217                 revision=revision,
   (...)
   1220                 dynamic_modules_path=dynamic_modules_path,
   1221             ).get_module()
   1222         else:
   1223             return HubDatasetModuleFactoryWithoutScript(
   1224                 path,
   1225                 revision=revision,
   1226                 data_dir=data_dir,
   1227                 data_files=data_files,
   1228                 download_config=download_config,
   1229                 download_mode=download_mode,
-> 1230             ).get_module()
   1231 except Exception as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
   1232     try:

File /opt/conda/lib/python3.10/site-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
    836     token = self.download_config.use_auth_token
    837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
    838     self.name,
    839     revision=self.revision,
    840     token=token,
    841     timeout=100.0,
    842 )
    843 patterns = (
    844     sanitize_patterns(self.data_files)
    845     if self.data_files is not None
--> 846     else get_patterns_in_dataset_repository(hfh_dataset_info)
    847 )
    848 data_files = DataFilesDict.from_hf_repo(
    849     patterns,
    850     dataset_info=hfh_dataset_info,
    851     allowed_extensions=ALL_ALLOWED_EXTENSIONS,
    852 )
    853 infered_module_names = {
    854     key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
    855     for key, data_files_list in data_files.items()
    856 }

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
    469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
    470 try:
--> 471     return _get_data_files_patterns(resolver)
    472 except FileNotFoundError:
    473     raise FileNotFoundError(
    474         f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
    475     ) from None

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
     97 try:
     98     for pattern in patterns:
---> 99         data_files = pattern_resolver(pattern)
    100         if len(data_files) > 0:
    101             non_empty_splits.append(split)

File /opt/conda/lib/python3.10/site-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
    301 data_files_ignore = FILES_TO_IGNORE
    302 fs = HfFileSystem(repo_info=dataset_info)
--> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
    304 matched_paths = [
    305     filepath
    306     for filepath in glob_iter
    307     if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
    308 ]
    309 if allowed_extensions is not None:

File /opt/conda/lib/python3.10/site-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
    602         depth = None
    604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
    607 pattern = re.compile(pattern)
    609 out = {
    610     p: info
    611     for p, info in sorted(allpaths.items())
   (...)
    618     )
    619 }

File /opt/conda/lib/python3.10/site-packages/fsspec/utils.py:734, in glob_translate(pat)
    732     continue
    733 elif "**" in part:
--> 734     raise ValueError(
    735         "Invalid pattern: '**' can only be an entire path component"
    736     )
    737 if part:
    738     results.extend(_translate(part, f"{not_sep}*", not_sep))

ValueError: Invalid pattern: '**' can only be an entire path component
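
One more thing worth ruling out here (my guess, not something this traceback proves): in hosted notebooks such as Kaggle, running pip install -U datasets in a cell does not affect modules that were already imported, so the kernel may still be using the pre-installed datasets/fsspec. A small check:

import datasets
import fsspec

# If these versions are older than the ones pip reported installing, or the
# paths point at the pre-installed site-packages, restart the kernel and rerun.
print(datasets.__version__, datasets.__file__)
print(fsspec.__version__, fsspec.__file__)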



