Pandas version checks

  • [X] I have checked that this issue has not already been reported.

  • [X] I have confirmed this bug exists on the latest version of pandas.

  • [X] I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd
# Reading multiple .parquet files from a directory
df = pd.read_parquet("directory")

Issue Description

Assume we are reading multiple .parquet files from a directory. If the first file has a column containing only None values, this error is raised: "ArrowNotImplementedError: Unsupported cast from int64 to null using function cast_null". When I replace the first file with a file that has no all-None column, the error goes away.
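As a possible workaround (a minimal, untested sketch; the column names and types below are illustrative assumptions), the directory can be read through pyarrow's dataset API with an explicit schema, so the all-None column is not inferred as the null type from the first file:

import pyarrow as pa
import pyarrow.dataset as ds

# Declare 'age' as int64 up front; pyarrow can cast a null-typed column to
# int64 (the values simply stay null), so the schema mismatch goes away.
schema = pa.schema([
    ('name', pa.string()),
    ('last_name', pa.string()),
    ('age', pa.int64()),
    ('city', pa.string()),
    ('gender', pa.string()),
])

table = ds.dataset('directory', format='parquet', schema=schema).to_table()
df = table.to_pandas()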

Expected Behavior

read_parquet should not crash; the all-None column should be combined with the typed columns from the other files.

Installed Versions

INSTALLED VERSIONS
------------------
commit : a671b5a8bf5dd13fb19f0e88edc679bc9e15c673
python : 3.11.7.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.22000
machine : AMD64
processor : Intel64 Family 6 Model 140 Stepping 1, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : English_United States.1252
pandas : 2.1.4
numpy : 1.26.4
pytz : 2023.3.post1
dateutil : 2.8.2
setuptools : 68.2.2
pip : 23.3.1
Cython : None
pytest : 7.4.0
hypothesis : None
sphinx : 5.0.2
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.9.3
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.3
IPython : 8.20.0
pandas_datareader : None
bs4 : 4.12.2
bottleneck : 1.3.7
dataframe-api-compat: None
fastparquet : None
fsspec : 2023.10.0
gcsfs : None
matplotlib : 3.8.0
numba : 0.59.0
numexpr : 2.8.7
odfpy : None
openpyxl : 3.0.10
pandas_gbq : None
pyarrow : 14.0.2
pyreadstat : None
pyxlsb : None
s3fs : 2023.10.0
scipy : 1.11.4
sqlalchemy : 2.0.25
tables : 3.9.2
tabulate : 0.9.0
xarray : 2023.6.0
xlrd : None
zstandard : 0.19.0
tzdata : 2023.3
qtpy : 2.4.1
pyqt5 : None

Comment From: mroeschke

Thanks for the report, but we would need a reproducible example (using .to_parquet & read_parquet) in order to further debug this issue.

Comment From: Saeid696

Here is some code; I hope it is useful.

import pandas as pd
import os

Making a directory for the files

os.makedirs('folder/subfolder', exist_ok=True)

First .parquet file

df01 = pd.DataFrame({'name': 'saeid', 'last_name': 'Gorbaniyan', 'age': None, 'city': 'maragheh', 'gender': "M"}, index=[0])
df01.to_parquet('folder/subfolder/df01.parquet', engine='pyarrow')

Second .parquet file

df02 = pd.DataFrame({'name': 'saeid', 'last_name': 'Gorbaniyan', 'age': 28, 'city': 'maragheh', 'gender': "M"}, index=[0])
df02.to_parquet('folder/subfolder/df02.parquet', engine='pyarrow')

Reading from the directory

df = pd.read_parquet('./folder/subfolder')

df

And here the error appears:


ArrowNotImplementedError                  Traceback (most recent call last)
Cell In[13], line 1
----> 1 df = pd.read_parquet('./folder/subfolder')
      2 df

File e:\Anaconda File\Lib\site-packages\pandas\io\parquet.py:670, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
    667     use_nullable_dtypes = False
    668 check_dtype_backend(dtype_backend)
--> 670 return impl.read(
    671     path,
    672     columns=columns,
    673     filters=filters,
    674     storage_options=storage_options,
    675     use_nullable_dtypes=use_nullable_dtypes,
    676     dtype_backend=dtype_backend,
    677     filesystem=filesystem,
    678     **kwargs,
    679 )

File e:\Anaconda File\Lib\site-packages\pandas\io\parquet.py:272, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
    265 path_or_handle, handles, filesystem = _get_path_or_handle(
    266     path,
    267     filesystem,
    268     storage_options=storage_options,
    269     mode="rb",
    270 )
    271 try:
--> 272     pa_table = self.api.parquet.read_table(
    273         path_or_handle,
    274         columns=columns,
    275         filesystem=filesystem,
    276         filters=filters,
    277         **kwargs,
    278     )
    279     result = pa_table.to_pandas(**to_pandas_kwargs)
    281     if manager == "array":

File e:\Anaconda File\Lib\site-packages\pyarrow\parquet\core.py:3003, in read_table(source, columns, use_threads, metadata, schema, use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit)
   2992 # TODO test that source is not a directory or a list
   2993 dataset = ParquetFile(
   2994     source, metadata=metadata, read_dictionary=read_dictionary,
   2995     memory_map=memory_map, buffer_size=buffer_size,
   (...)
   3000     thrift_container_size_limit=thrift_container_size_limit,
   3001 )
-> 3003 return dataset.read(columns=columns, use_threads=use_threads,
   3004                     use_pandas_metadata=use_pandas_metadata)
   3006 warnings.warn(
   3007     "Passing 'use_legacy_dataset=True' to get the legacy behaviour is "
   3008     "deprecated as of pyarrow 8.0.0, and the legacy implementation will "
   3009     "be removed in a future version.",
   3010     FutureWarning, stacklevel=2)
   3012 if ignore_prefixes is not None:

File e:\Anaconda File\Lib\site-packages\pyarrow\parquet\core.py:2631, in _ParquetDatasetV2.read(self, columns, use_threads, use_pandas_metadata)
   2623 index_columns = [
   2624     col for col in _get_pandas_index_columns(metadata)
   2625     if not isinstance(col, dict)
   2626 ]
   2627 columns = (
   2628     list(columns) + list(set(index_columns) - set(columns))
   2629 )
-> 2631 table = self._dataset.to_table(
   2632     columns=columns, filter=self._filter_expression,
   2633     use_threads=use_threads
   2634 )
   2636 # if use_pandas_metadata, restore the pandas metadata (which gets
   2637 # lost if doing a specific columns selection in to_table)
   2638 if use_pandas_metadata:

File e:\Anaconda File\Lib\site-packages\pyarrow\_dataset.pyx:556, in pyarrow._dataset.Dataset.to_table()

File e:\Anaconda File\Lib\site-packages\pyarrow\_dataset.pyx:3713, in pyarrow._dataset.Scanner.to_table()

File e:\Anaconda File\Lib\site-packages\pyarrow\error.pxi:154, in pyarrow.lib.pyarrow_internal_check_status()

File e:\Anaconda File\Lib\site-packages\pyarrow\error.pxi:91, in pyarrow.lib.check_status()

ArrowNotImplementedError: Unsupported cast from int64 to null using function cast_null
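A possible mitigation on the writing side (an untested sketch; 'Int64' is pandas' nullable integer dtype) is to give the all-None column an explicit dtype before writing, so the first file stores an int64 column instead of a null-typed one:

# Cast the all-None 'age' column to the nullable Int64 dtype before writing;
# the resulting parquet column is int64 with a null value, so reading the
# directory no longer requires a cast between int64 and null.
df01 = df01.astype({'age': 'Int64'})
df01.to_parquet('folder/subfolder/df01.parquet', engine='pyarrow')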

Comment From: mroeschke

This sounds like it is potentially an upstream pyarrow issue, so closing.