Commit ca80fd1

Force block partitions materialization
1 parent: a3b508b

1 file changed: +17 −29 lines

modin/core/execution/ray/implementations/pandas_on_ray/partitioning/virtual_partition.py

Lines changed: 17 additions & 29 deletions
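The recurring edit below swaps part._data_ref for part._data wherever block partitions feed the virtual partition. Read together with the commit title, this suggests that _data is the accessor that forces the underlying deferred computation to run, while _data_ref merely hands back the still-pending reference. A minimal toy sketch of that pattern, using invented names (Deferred, ToyPartition) rather than Modin's actual classes:

class Deferred:
    """Toy stand-in for a pending computation (not Modin's DeferredExecution)."""

    def __init__(self, func, value):
        self.func, self.value = func, value

    def execute(self):
        return self.func(self.value)


class ToyPartition:
    """Illustrative partition exposing a raw reference and a forcing accessor."""

    def __init__(self, data_ref):
        self._data_ref = data_ref  # either a Deferred or an already-computed value

    @property
    def _data(self):
        # Accessing _data forces materialization: a pending Deferred is executed
        # once and its result is cached back into _data_ref.
        if isinstance(self._data_ref, Deferred):
            self._data_ref = self._data_ref.execute()
        return self._data_ref


part = ToyPartition(Deferred(sum, [1, 2, 3]))
print(part._data_ref)  # still a Deferred object, nothing computed yet
print(part._data)      # 6 -- the computation has been forced

With that reading, _set_data_ref(data[0]._data) and self._concat([part._data for part in data]) ensure each block partition has been computed before the virtual partition is wired to it.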
@@ -88,38 +88,35 @@ def __init__(
         if not isinstance(data, Collection) or len(data) == 1:
             if not isinstance(data, Collection):
                 data = [data]
-            self._set_data_ref(data[0]._data_ref)
+            self._set_data_ref(data[0]._data)
             self._num_splits = 1
             self._list_of_block_partitions = data
             return
 
         self._num_splits = len(data)
         self._list_of_block_partitions = data
-        refs = [part._data_ref for part in self._list_of_block_partitions]
+        refs = [part._data_ref for part in data]
 
         if (
             isinstance(refs[0], _DeferredGetChunk)
+            and isinstance(split := refs[0].data, _DeferredSplit)
             and (refs[0].index == 0)
             and all(prev.is_next_chunk(next) for prev, next in zip(refs[:-1], refs[1:]))
         ):
-            self._chunk_lengths_cache = (
-                None
-                if any(chunk.length is None for chunk in refs)
-                else [chunk.length for chunk in refs]
-            )
+            if all(chunk.length is not None for chunk in refs):
+                self._chunk_lengths_cache = [chunk.length for chunk in refs]
 
-            split: _DeferredSplit = refs[0].split
-            if split.num_splits == refs[-1].index:
+            if split.num_splits == refs[-1].index + 1:
                 # All the partitions are the chunks of the same DataFrame. Concatenation of
                 # all these chunks will get a df identical to the original one. Thus, we
                 # don't need to concatenate but can get the original one instead.
-                self._set_data_ref(split.non_split)
+                self._set_data_ref(split.data)
                 return
 
             # TODO: We have a subset of the same frame here and can just get a single chunk
             # from the original frame instead of concatenating all these chunks.
 
-        self._set_data_ref(self._concat(refs))
+        self._set_data_ref(self._concat([part._data for part in data]))
 
     def _set_data_ref(
         self, data: Union[DeferredExecution, ObjectRefType]
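The new condition isinstance(split := refs[0].data, _DeferredSplit) does two jobs: it checks that the first chunk still points at a _DeferredSplit, and it binds that object to split for the num_splits comparison below, replacing the deleted split: _DeferredSplit = refs[0].split line. The "+ 1" in split.num_splits == refs[-1].index + 1 also looks like an off-by-one fix, since chunk indices start at 0 and a complete run of chunks ends at index num_splits - 1. A standalone illustration of the walrus-in-isinstance idiom, with generic names unrelated to Modin:

def describe(obj):
    # The assignment expression binds the intermediate value while the
    # isinstance check decides which branch to take, so the attribute is
    # only read once.
    if isinstance(parent := getattr(obj, "data", None), dict):
        return f"dict parent with {len(parent)} key(s)"
    return f"parent is {type(parent).__name__}"


class Holder:
    def __init__(self, data):
        self.data = data


print(describe(Holder({"a": 1})))  # dict parent with 1 key(s)
print(describe(object()))          # parent is NoneType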
@@ -165,16 +162,16 @@ def apply(
         if other_axis_partition is not None:
             if isinstance(other_axis_partition, Collection):
                 if len(other_axis_partition) == 1:
-                    other_part = other_axis_partition[0]._data_ref
+                    other_part = other_axis_partition[0]._data
                 else:
                     concat_fn = (
                         PandasOnRayDataframeColumnPartition
                         if self.axis
                         else PandasOnRayDataframeRowPartition
                     )._concat
-                    other_part = concat_fn([p._data_ref for p in other_axis_partition])
+                    other_part = concat_fn([p._data for p in other_axis_partition])
             else:
-                other_part = other_axis_partition._data_ref
+                other_part = other_axis_partition._data
             args = [other_part] + list(args)
 
         de = self._apply(func, args, kwargs)
@@ -224,10 +221,6 @@ def split(
     def _length_cache(self): # noqa: GL08
         return self._meta[self._meta_offset]
 
-    @_length_cache.setter
-    def _length_cache(self, value): # noqa: GL08
-        self._meta[self._meta_offset] = value
-
     def length(self, materialize=True): # noqa: GL08
         if self._length_cache is None:
             self._calculate_lengths(materialize)
@@ -237,10 +230,6 @@ def length(self, materialize=True): # noqa: GL08
     def _width_cache(self): # noqa: GL08
         return self._meta[self._meta_offset + 1]
 
-    @_width_cache.setter
-    def _width_cache(self, value): # noqa: GL08
-        self._meta[self._meta_offset + 1] = value
-
     def width(self, materialize=True): # noqa: GL08
         if self._width_cache is None:
             self._calculate_lengths(materialize)
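Both cache hunks drop the property setters, leaving _length_cache and _width_cache as read-only views over the shared self._meta list; the caches can now only change when self._meta itself is updated (here via _calculate_lengths). A generic sketch of that read-only, metadata-backed cache pattern, with illustrative names only:

class MetaBacked:
    """Generic sketch: a read-only cache property over a shared metadata list."""

    def __init__(self, meta, offset):
        self._meta = meta          # shared metadata storage
        self._meta_offset = offset

    @property
    def _length_cache(self):
        # Read-only: no setter, so the cache changes only when _meta is updated.
        return self._meta[self._meta_offset]

    def length(self):
        if self._length_cache is None:
            # Populate the shared metadata in one place instead of through a setter.
            self._meta[self._meta_offset] = self._compute_length()
        return self._length_cache

    def _compute_length(self):
        return 42  # placeholder for the real length computation


meta = [None, None]
obj = MetaBacked(meta, 0)
print(obj.length())  # 42, and meta[0] is now filled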
@@ -417,18 +406,17 @@ def split(
 class _DeferredSplit(DeferredExecution): # noqa: GL08
     def __init__(
         self,
-        non_split: ObjectRefOrDeType,
+        obj: ObjectRefOrDeType,
         func: ObjectRefType,
         num_splits: int,
-        lengths: Optional[List[int]],
+        lengths: Union[List[int], None],
     ):
-        self.non_split = non_split
         self.num_splits = num_splits
         self.skip_chunks = set()
         args = [num_splits, MinPartitionSize.get(), self.skip_chunks]
         if lengths and (len(lengths) == num_splits):
             args.extend(lengths)
-        super().__init__(non_split, func, args, num_returns=num_splits)
+        super().__init__(obj, func, args, num_returns=num_splits)
 
 
 class _DeferredGetChunk(DeferredGetItem): # noqa: GL08
@@ -439,13 +427,13 @@ def __init__(self, split: _DeferredSplit, index: int, length: Optional[int] = No
 
     def __del__(self):
         """Remove this chunk from _DeferredSplit if it's not executed yet."""
-        if self.data is self.split:
-            self.split.skip_chunks.add(self.index)
+        if isinstance(self.data, _DeferredSplit):
+            self.data.skip_chunks.add(self.index)
 
     def is_next_chunk(self, other): # noqa: GL08
         return (
             isinstance(other, _DeferredGetChunk)
-            and (self.split is other.split)
+            and (self.data is other.data)
             and (other.index == self.index + 1)
         )
 
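The last two hunks remove the separate split/non_split attributes: a chunk now reaches its parent split through the inherited data field, so __del__ checks isinstance(self.data, _DeferredSplit) to decide whether the chunk was never executed before registering its index in skip_chunks. A toy sketch of that unregister-on-garbage-collection pattern, with made-up ToySplit/ToyChunk classes (it relies on CPython's reference counting to run __del__ promptly):

class ToySplit:
    """Produces chunks lazily; indices in skip_chunks need not be computed."""

    def __init__(self, num_splits):
        self.num_splits = num_splits
        self.skip_chunks = set()


class ToyChunk:
    def __init__(self, split, index):
        self.data = split  # replaced by the real result once the chunk is executed
        self.index = index

    def __del__(self):
        # If the chunk dies while still pointing at the split (i.e. it was never
        # executed), tell the split it can skip producing this index.
        if isinstance(self.data, ToySplit):
            self.data.skip_chunks.add(self.index)


split = ToySplit(num_splits=4)
chunk = ToyChunk(split, index=2)
del chunk  # CPython's reference counting triggers __del__ here
print(split.skip_chunks)  # {2}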
