diff --git a/pandas/io/common.py b/pandas/io/common.py
index 1a9e6b472463d..25c66c3fa6d04 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -9,7 +9,9 @@
 import codecs
 from collections import defaultdict
 from collections.abc import (
+    Generator,
     Hashable,
+    Iterable,
     Mapping,
     Sequence,
 )
@@ -26,7 +28,10 @@
 )
 import mmap
 import os
-from pathlib import Path
+from pathlib import (
+    Path,
+    PurePosixPath,
+)
 import re
 import tarfile
 from typing import (
@@ -55,6 +60,7 @@
     BaseBuffer,
     ReadCsvBuffer,
 )
+from pandas.compat import is_platform_windows
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
@@ -1282,3 +1288,146 @@ def dedup_names(
         counts[col] = cur_count + 1
 
     return names
+
+
+def _infer_protocol(path: str) -> str:
+    # Treat Windows drive letters like C:\ as local file paths
+    if is_platform_windows() and re.match(r"^[a-zA-Z]:[\\/]", path):
+        return "file"
+
+    parsed = parse_url(path)
+    if parsed.scheme in _VALID_URLS:
+        return parsed.scheme
+    return "file"
+
+
+def _normalize_extensions(
+    extensions: str | Iterable[str] | None,
+) -> set[str] | None:
+    """
+    Normalize file extensions to a set of lowercase, dot-prefixed suffixes.
+
+    ``Path.suffix`` always includes the leading dot, so ``"csv"`` is turned
+    into ``".csv"`` to make both spellings match the same files.
+    """
+    if extensions is None:
+        return None
+    if isinstance(extensions, str):
+        extensions = [extensions]
+    return {
+        ext.lower() if not ext or ext.startswith(".") else f".{ext.lower()}"
+        for ext in extensions
+    }
+
+
+def _match_file(
+    path: Path | PurePosixPath, extensions: set[str] | None, glob: str | None
+) -> bool:
+    """
+    Check if the file matches the given extensions and glob pattern.
+
+    Parameters
+    ----------
+    path : Path or PurePosixPath
+        The file path to check.
+    extensions : set[str] or None
+        Lowercase suffixes (including the leading dot) to match against.
+        If None, any suffix is accepted.
+    glob : str or None
+        A glob pattern to match against. If None, any name is accepted.
+
+    Returns
+    -------
+    bool
+        True if the file matches the extensions and glob pattern, False otherwise.
+    """
+    return (extensions is None or path.suffix.lower() in extensions) and (
+        glob is None or path.match(glob)
+    )
+
+
+def iterdir(
+    path: FilePath,
+    extensions: str | Iterable[str] | None = None,
+    glob: str | None = None,
+) -> Generator[Path | PurePosixPath]:
+    """
+    Yield file paths in a directory (no nesting allowed).
+
+    Supports:
+    - Local paths (str, os.PathLike)
+    - file:// URLs
+    - Remote paths (e.g., s3://) via fsspec (if installed)
+
+    Parameters
+    ----------
+    path : FilePath
+        Path to the directory (local or remote). If it points to a single
+        file, that file is yielded when it matches the filters.
+    extensions : str or iterable of str, optional
+        Only yield files with the given extension(s). Case-insensitive, and
+        the leading dot is optional (``"csv"`` and ``".csv"`` are equivalent).
+        If None, all files are yielded.
+    glob : str, optional
+        Only yield files matching the given glob pattern.
+        If None, all files are yielded.
+
+    Yields
+    ------
+    pathlib.Path or pathlib.PurePosixPath
+        File paths within the directory.
+
+    Raises
+    ------
+    NotADirectoryError
+        If the given path is neither a file nor a directory.
+    ImportError
+        If fsspec is required but not installed.
+    """
+    suffixes = _normalize_extensions(extensions)
+    path_str = os.fspath(path)
+    scheme = _infer_protocol(path_str)
+
+    if scheme == "file":
+        parsed = parse_url(path_str)
+        if parsed.scheme == "file":
+            # pathlib does not understand URLs, so turn a file:// URL into
+            # a plain OS path before building the Path.
+            from urllib.request import url2pathname
+
+            path_str = url2pathname(parsed.path)
+
+        resolved_path = Path(path_str)
+        if resolved_path.is_file():
+            if _match_file(resolved_path, suffixes, glob):
+                yield resolved_path
+            return
+
+        if not resolved_path.is_dir():
+            raise NotADirectoryError(
+                f"Path {path!r} is neither a file nor a directory."
+            )
+
+        for entry in resolved_path.iterdir():
+            if entry.is_file() and _match_file(entry, suffixes, glob):
+                yield entry
+        return
+
+    # Remote paths (e.g., s3)
+    fsspec = import_optional_dependency("fsspec", extra=scheme)
+    fs = fsspec.filesystem(scheme)
+    if fs.isfile(path_str):
+        # PurePosixPath would collapse the "//" of the URL, so use the name
+        # reported by the filesystem, as for the fs.ls entries below.
+        path_obj = PurePosixPath(fs.info(path_str)["name"])
+        if _match_file(path_obj, suffixes, glob):
+            yield path_obj
+        return
+    if not fs.isdir(path_str):
+        raise NotADirectoryError(f"Path {path!r} is neither a file nor a directory.")
+
+    for info in fs.ls(path_str, detail=True):
+        if info["type"] == "file":
+            path_obj = PurePosixPath(info["name"])
+            if _match_file(path_obj, suffixes, glob):
+                yield path_obj
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index a5ddda9d66e7a..799fea1e97fec 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -223,3 +223,14 @@ def compression_format(request):
 @pytest.fixture(params=_compression_formats_params)
 def compression_ext(request):
     return request.param[0]
+
+
+@pytest.fixture
+def directory_with_dummy_csv(tmp_path):
+    """
+    Fixture to create a directory with dummy CSV files for testing.
+    """
+    for i in range(3):
+        file_path = tmp_path / f"file_{i}.csv"
+        file_path.touch()
+    return tmp_path
diff --git a/pandas/tests/io/parser/test_directory.py b/pandas/tests/io/parser/test_directory.py
new file mode 100644
index 0000000000000..84edc58570036
--- /dev/null
+++ b/pandas/tests/io/parser/test_directory.py
@@ -0,0 +1,37 @@
+from csv import (
+    DictWriter,
+    reader as csv_reader,
+)
+
+import pytest
+
+
+@pytest.fixture
+def directory_data():
+    return ["a", "b", "c"], [
+        {"first": {"a": 1, "b": 2, "c": 3}},
+        {"second": {"a": 4, "b": 5, "c": 6}},
+        {"third": {"a": 7, "b": 8, "c": 9}},
+    ]
+
+
+@pytest.fixture
+def directory_data_to_file(tmp_path, directory_data):
+    field_names, data_list = directory_data
+    for data in data_list:
+        file_name = next(iter(data.keys()))
+        path = tmp_path / f"{file_name}.csv"
+        with path.open("w", newline="", encoding="utf-8") as file:
+            writer = DictWriter(file, fieldnames=field_names)
+            writer.writeheader()
+            writer.writerow(data[file_name])
+    return tmp_path
+
+
+def test_directory_data(directory_data_to_file):
+    assert len(list(directory_data_to_file.iterdir())) == 3
+    for file in directory_data_to_file.iterdir():
+        with file.open(encoding="utf-8") as f:
+            reader = csv_reader(f)
+            header = next(reader)
+            assert header == ["a", "b", "c"]
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 4a5e41397b59d..48523414578d3 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -695,3 +695,28 @@ def test_pyarrow_read_csv_datetime_dtype():
     expect = pd.DataFrame({"date": expect_data})
 
     tm.assert_frame_equal(expect, result)
+
+
+def test_iterdir(directory_with_dummy_csv):
+    files = list(icom.iterdir(directory_with_dummy_csv))
+    # Guard against the loop below passing vacuously on an empty result.
+    assert len(files) == 3
+    for file in files:
+        assert file.is_file()
+        assert file.name.startswith("file_")
+        assert file.suffix == ".csv"
+
+
+@pytest.mark.parametrize("extensions", ["csv", ".csv", ".CSV", ["txt", "csv"]])
+def test_iterdir_extensions(directory_with_dummy_csv, extensions):
+    files = list(icom.iterdir(directory_with_dummy_csv, extensions=extensions))
+    assert len(files) == 3
+
+
+def test_iterdir_extensions_no_match(directory_with_dummy_csv):
+    assert list(icom.iterdir(directory_with_dummy_csv, extensions="txt")) == []
+
+
+def test_iterdir_file_url(directory_with_dummy_csv):
+    files = list(icom.iterdir(directory_with_dummy_csv.as_uri()))
+    assert len(files) == 3
diff --git a/web/pandas/static/img/books/pandas_cookbook_3.jpeg b/web/pandas/static/img/books/pandas_cookbook_3.jpeg
new file mode 100644
index 0000000000000..cf1c27037de68
Binary files /dev/null and b/web/pandas/static/img/books/pandas_cookbook_3.jpeg differ