Pandas version checks
- [X] I have checked that this issue has not already been reported.
- [X] I have confirmed this bug exists on the latest version of pandas.
- [X] I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
# Reading multiple .parquet files from a directory
df = pd.read_parquet("directory")
Issue Description
Assume we read multiple .parquet files from a directory. If the first file has a column containing only None values, this error is raised: "ArrowNotImplementedError: Unsupported cast from int64 to null using function cast_null". If I replace the first file with one that has no all-None column, the error goes away.
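A likely root cause (my reading, not confirmed): pyarrow writes a column holding only None with the Arrow null type, while the same column in the other file is int64, and the directory scan adopts the first file's schema. A minimal sketch to inspect the per-file schemas, assuming pyarrow is installed and the two files from the reproducer below exist:

import pyarrow.parquet as pq

# The 'age' column differs between the two files
print(pq.read_schema('folder/subfolder/df01.parquet'))  # age: null
print(pq.read_schema('folder/subfolder/df02.parquet'))  # age: int64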
Expected Behavior
pd.read_parquet should not crash; the files should be combined into a single DataFrame.
Installed Versions
Comment From: mroeschke
Thanks for the report, but we would need a reproducible example (using .to_parquet & read_parquet) in order to further debug this issue.
Comment From: Saeid696
Here is some code; I hope it's useful.
import pandas as pd
import os

# Making a path for the multiple files
os.makedirs('folder/subfolder', exist_ok=True)

# First .parquet file (its 'age' column holds only None)
df01 = pd.DataFrame({'name': 'saeid', 'last_name': 'Gorbaniyan', 'age': None,
                     'city': 'maragheh', 'gender': "M"}, index=[0])
df01.to_parquet('folder/subfolder/df01.parquet', engine='pyarrow')

# Second .parquet file (its 'age' column holds an integer)
df02 = pd.DataFrame({'name': 'saeid', 'last_name': 'Gorbaniyan', 'age': 28,
                     'city': 'maragheh', 'gender': "M"}, index=[0])
df02.to_parquet('folder/subfolder/df02.parquet', engine='pyarrow')

# Reading from the directory
df = pd.read_parquet('./folder/subfolder')
df
And here the error appears:
---------------------------------------------------------------------------
ArrowNotImplementedError                  Traceback (most recent call last)
Cell In[13], line 1
----> 1 df = pd.read_parquet('./folder/subfolder')
      2 df

File e:\Anaconda File\Lib\site-packages\pandas\io\parquet.py:670, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
    667     use_nullable_dtypes = False
    668 check_dtype_backend(dtype_backend)
--> 670 return impl.read(
    671     path,
    672     columns=columns,
    673     filters=filters,
    674     storage_options=storage_options,
    675     use_nullable_dtypes=use_nullable_dtypes,
    676     dtype_backend=dtype_backend,
    677     filesystem=filesystem,
    678     **kwargs,
    679 )

File e:\Anaconda File\Lib\site-packages\pandas\io\parquet.py:272, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
    265 path_or_handle, handles, filesystem = _get_path_or_handle(
    266     path,
    267     filesystem,
    268     storage_options=storage_options,
    269     mode="rb",
    270 )
    271 try:
--> 272     pa_table = self.api.parquet.read_table(
    273         path_or_handle,
    274         columns=columns,
    275         filesystem=filesystem,
    276         filters=filters,
    277         **kwargs,
    278     )
    279     result = pa_table.to_pandas(**to_pandas_kwargs)
    281     if manager == "array":

File e:\Anaconda File\Lib\site-packages\pyarrow\parquet\core.py:3003, in read_table(source, columns, use_threads, metadata, schema, use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit)
   2992     # TODO test that source is not a directory or a list
   2993     dataset = ParquetFile(
   2994         source, metadata=metadata, read_dictionary=read_dictionary,
   2995         memory_map=memory_map, buffer_size=buffer_size,
   (...)
   3000         thrift_container_size_limit=thrift_container_size_limit,
   3001     )
-> 3003 return dataset.read(columns=columns, use_threads=use_threads,
   3004                     use_pandas_metadata=use_pandas_metadata)
   3006 warnings.warn(
   3007     "Passing 'use_legacy_dataset=True' to get the legacy behaviour is "
   3008     "deprecated as of pyarrow 8.0.0, and the legacy implementation will "
   3009     "be removed in a future version.",
   3010     FutureWarning, stacklevel=2)
   3012 if ignore_prefixes is not None:

File e:\Anaconda File\Lib\site-packages\pyarrow\parquet\core.py:2631, in _ParquetDatasetV2.read(self, columns, use_threads, use_pandas_metadata)
   2623 index_columns = [
   2624     col for col in _get_pandas_index_columns(metadata)
   2625     if not isinstance(col, dict)
   2626 ]
   2627 columns = (
   2628     list(columns) + list(set(index_columns) - set(columns))
   2629 )
-> 2631 table = self._dataset.to_table(
   2632     columns=columns, filter=self._filter_expression,
   2633     use_threads=use_threads
   2634 )
   2636 # if use_pandas_metadata, restore the pandas metadata (which gets
   2637 # lost if doing a specific columns selection in to_table)
   2638 if use_pandas_metadata:

File e:\Anaconda File\Lib\site-packages\pyarrow\_dataset.pyx:556, in pyarrow._dataset.Dataset.to_table()

File e:\Anaconda File\Lib\site-packages\pyarrow\_dataset.pyx:3713, in pyarrow._dataset.Scanner.to_table()

File e:\Anaconda File\Lib\site-packages\pyarrow\error.pxi:154, in pyarrow.lib.pyarrow_internal_check_status()

File e:\Anaconda File\Lib\site-packages\pyarrow\error.pxi:91, in pyarrow.lib.check_status()

ArrowNotImplementedError: Unsupported cast from int64 to null using function cast_null
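A possible workaround, as a sketch rather than an official fix: give the all-None column an explicit nullable integer dtype before writing, so both files carry an int64 Arrow type and no int64-to-null cast is needed when reading the directory:

import pandas as pd

df01 = pd.DataFrame({'name': 'saeid', 'last_name': 'Gorbaniyan', 'age': None,
                     'city': 'maragheh', 'gender': "M"}, index=[0])
# Cast the all-None column to pandas' nullable Int64 so pyarrow writes int64
df01['age'] = df01['age'].astype('Int64')
df01.to_parquet('folder/subfolder/df01.parquet', engine='pyarrow')

# pd.read_parquet('./folder/subfolder') should now unify the two schemas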
Comment From: mroeschke
This sounds like a potential upstream pyarrow issue, so closing.
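For reference, a minimal pure-pyarrow sketch (no pandas involved) that should hit the same cast, which supports the upstream reading; the file names here are illustrative:

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

pq.write_table(pa.table({'age': pa.array([None], type=pa.null())}), 'f1.parquet')
pq.write_table(pa.table({'age': pa.array([28], type=pa.int64())}), 'f2.parquet')

# The dataset infers its schema from the first fragment (age: null), then
# tries to cast the second file's int64 column to null, which should raise
# the same ArrowNotImplementedError as above.
table = ds.dataset(['f1.parquet', 'f2.parquet']).to_table()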