Commit 8f382b0

Merge branch 'master' of https://github.com/modin-project/modin into fix-testsw

2 parents b05cc60 + 0dfd88d

103 files changed: +4452 −1430 lines changed

.github/actions/mamba-env/action.yml

Lines changed: 4 additions & 0 deletions

@@ -42,3 +42,7 @@ runs:
           # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
           # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
           use-only-tar-bz2: false
+    - shell: bash -l {0}
+      run: |
+        conda run -n ${{ inputs.activate-environment }} pip install .
+        conda list -n ${{ inputs.activate-environment }}

.github/actions/run-core-tests/group_2/action.yml

Lines changed: 0 additions & 2 deletions

@@ -20,5 +20,3 @@ runs:
          modin/pandas/test/dataframe/test_pickle.py
        echo "::endgroup::"
      shell: bash -l {0}
-    - run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
-      shell: bash -l {0}

.github/actions/run-core-tests/group_3/action.yml

Lines changed: 6 additions & 1 deletion

@@ -18,7 +18,12 @@ runs:
        echo "::endgroup::"
      shell: bash -l {0}
    - run: |
-       echo "::group::Running experimental groupby tests (group 3)..."
+       echo "::group::Running range-partitioning tests (group 3)..."
        MODIN_RANGE_PARTITIONING_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
+       MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_series.py -k "test_unique or test_nunique or drop_duplicates or test_resample"
+       MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_general.py -k "test_unique"
+       MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_map_metadata.py -k "drop_duplicates"
+       MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_join_sort.py -k "merge"
+       MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_default.py -k "resample"
        echo "::endgroup::"
      shell: bash -l {0}
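Each added line above sets `MODIN_RANGE_PARTITIONING=1` for a single pytest invocation only. As a standalone illustration of that shell pattern (no Modin involved), a one-command assignment is visible to the child process but does not persist afterwards:

```shell
# One-command environment assignment: the variable exists only for the
# child process, not for the rest of the script.
MODIN_RANGE_PARTITIONING=1 sh -c 'echo "inside: ${MODIN_RANGE_PARTITIONING}"'
echo "after: ${MODIN_RANGE_PARTITIONING:-unset}"
# prints "inside: 1" then "after: unset"
```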

.github/workflows/ci.yml

Lines changed: 0 additions & 1 deletion

@@ -188,7 +188,6 @@ jobs:
      - run: python -m pytest modin/pandas/test/dataframe/test_binary.py
      - run: python -m pytest modin/pandas/test/dataframe/test_reduce.py
      - run: python -m pytest modin/pandas/test/dataframe/test_join_sort.py
-     - run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
      - run: python -m pytest modin/pandas/test/test_general.py
      - run: python -m pytest modin/pandas/test/dataframe/test_indexing.py
      - run: python -m pytest modin/pandas/test/test_series.py

.github/workflows/push-to-master.yml

Lines changed: 0 additions & 1 deletion

@@ -46,7 +46,6 @@ jobs:
          python -m pytest modin/pandas/test/dataframe/test_indexing.py
          python -m pytest modin/pandas/test/dataframe/test_iter.py
          python -m pytest modin/pandas/test/dataframe/test_join_sort.py
-         MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
          python -m pytest modin/pandas/test/dataframe/test_map_metadata.py
          python -m pytest modin/pandas/test/dataframe/test_reduce.py
          python -m pytest modin/pandas/test/dataframe/test_udf.py

README.md

Lines changed: 2 additions & 0 deletions

@@ -24,6 +24,8 @@ Modin is a drop-in replacement for [pandas](https://github.com/pandas-dev/pandas
 single-threaded, Modin lets you instantly speed up your workflows by scaling pandas so it uses all of your
 cores. Modin works especially well on larger datasets, where pandas becomes painfully slow or runs
 [out of memory](https://modin.readthedocs.io/en/latest/getting_started/why_modin/out_of_core.html).
+Also, Modin comes with the [additional APIs](https://modin.readthedocs.io/en/latest/usage_guide/advanced_usage/index.html#additional-apis)
+to improve user experience.
 
 By simply replacing the import statement, Modin offers users effortless speed and scale for their pandas workflows:

asv_bench/benchmarks/utils/common.py

Lines changed: 13 additions & 45 deletions

@@ -114,11 +114,7 @@ def gen_nan_data(nrows: int, ncols: int) -> dict:
 
 def gen_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
     """
-    Generate int data with caching.
-
-    The generated data are saved in the dictionary and on a subsequent call,
-    if the keys match, saved data will be returned. Therefore, we need
-    to carefully monitor the changing of saved data and make its copy if needed.
+    Generate int data.
 
     Parameters
     ----------
@@ -136,30 +132,16 @@ def gen_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
     dict
         Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.
     """
-    cache_key = ("int", nrows, ncols, rand_low, rand_high)
-    if cache_key in data_cache:
-        return data_cache[cache_key]
-
-    logging.info(
-        "Generating int data {} rows and {} columns [{}-{}]".format(
-            nrows, ncols, rand_low, rand_high
-        )
-    )
     data = {
         "col{}".format(i): np.random.randint(rand_low, rand_high, size=(nrows))
         for i in range(ncols)
     }
-    data_cache[cache_key] = weakdict(data)
     return data
 
 
 def gen_str_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
     """
-    Generate int data and string data with caching.
-
-    The generated data are saved in the dictionary and on a subsequent call,
-    if the keys match, saved data will be returned. Therefore, we need
-    to carefully monitor the changing of saved data and make its copy if needed.
+    Generate int data and string data.
 
     Parameters
     ----------
@@ -178,30 +160,16 @@ def gen_str_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> d
         Number of keys - `ncols`, each of them store np.ndarray of `nrows` length.
         One of the columns with string values.
     """
-    cache_key = ("str_int", nrows, ncols, rand_low, rand_high)
-    if cache_key in data_cache:
-        return data_cache[cache_key]
-
-    logging.info(
-        "Generating str_int data {} rows and {} columns [{}-{}]".format(
-            nrows, ncols, rand_low, rand_high
-        )
-    )
     data = gen_int_data(nrows, ncols, rand_low, rand_high).copy()
     # convert values in arbitary column to string type
     key = list(data.keys())[0]
     data[key] = [f"str_{x}" for x in data[key]]
-    data_cache[cache_key] = weakdict(data)
     return data
 
 
 def gen_true_false_int_data(nrows, ncols, rand_low, rand_high):
     """
-    Generate int data and string data "true" and "false" values with caching.
-
-    The generated data are saved in the dictionary and on a subsequent call,
-    if the keys match, saved data will be returned. Therefore, we need
-    to carefully monitor the changing of saved data and make its copy if needed.
+    Generate int data and string data "true" and "false" values.
 
     Parameters
     ----------
@@ -221,15 +189,6 @@ def gen_true_false_int_data(nrows, ncols, rand_low, rand_high):
         One half of the columns with integer values, another half - with "true" and
         "false" string values.
     """
-    cache_key = ("true_false_int", nrows, ncols, rand_low, rand_high)
-    if cache_key in data_cache:
-        return data_cache[cache_key]
-
-    logging.info(
-        "Generating true_false_int data {} rows and {} columns [{}-{}]".format(
-            nrows, ncols, rand_low, rand_high
-        )
-    )
     data = gen_int_data(nrows // 2, ncols // 2, rand_low, rand_high)
 
     data_true_false = {
@@ -239,7 +198,6 @@ def gen_true_false_int_data(nrows, ncols, rand_low, rand_high):
         for i in range(ncols - ncols // 2)
     }
     data.update(data_true_false)
-    data_cache[cache_key] = weakdict(data)
     return data
 
 
@@ -289,10 +247,20 @@ def gen_data(
         "str_int": gen_str_int_data,
         "true_false_int": gen_true_false_int_data,
     }
+    cache_key = (data_type, nrows, ncols, rand_low, rand_high)
+    if cache_key in data_cache:
+        return data_cache[cache_key]
+
+    logging.info(
+        "Generating {} data {} rows and {} columns [{}-{}]".format(
+            data_type, nrows, ncols, rand_low, rand_high
+        )
+    )
     assert data_type in type_to_generator
     data_generator = type_to_generator[data_type]
 
     data = data_generator(nrows, ncols, rand_low, rand_high)
+    data_cache[cache_key] = weakdict(data)
 
     return data
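The net effect of this refactor is that caching (and its log line) now lives only in the single `gen_data` entry point, while the per-type generators stay pure. A minimal sketch of the resulting shape, using simplified stand-in generators (the real ones build `np.ndarray`s and wrap the cached dict in Modin's `weakdict`):

```python
import logging

# Cache keyed by every argument that affects the generated data;
# in Modin's benchmark suite this lives at module level.
data_cache = {}


def gen_int_data(nrows, ncols, rand_low, rand_high):
    # Pure generator: no caching, no logging (stand-in for the numpy version).
    return {f"col{i}": list(range(rand_low, rand_low + nrows)) for i in range(ncols)}


def gen_str_int_data(nrows, ncols, rand_low, rand_high):
    # Builds on gen_int_data directly; the cache is never consulted here.
    data = gen_int_data(nrows, ncols, rand_low, rand_high).copy()
    key = next(iter(data))
    data[key] = [f"str_{x}" for x in data[key]]
    return data


TYPE_TO_GENERATOR = {"int": gen_int_data, "str_int": gen_str_int_data}


def gen_data(data_type, nrows, ncols, rand_low=0, rand_high=100):
    # Caching happens exactly once, at the dispatch point.
    cache_key = (data_type, nrows, ncols, rand_low, rand_high)
    if cache_key in data_cache:
        return data_cache[cache_key]
    logging.info("Generating %s data %s rows and %s columns", data_type, nrows, ncols)
    data = TYPE_TO_GENERATOR[data_type](nrows, ncols, rand_low, rand_high)
    data_cache[cache_key] = data  # the benchmark wraps this in weakdict(...)
    return data
```

Repeated calls with the same arguments now hit the cache exactly once, regardless of which generator does the work, which is why the per-generator `cache_key`/`data_cache` boilerplate could be deleted wholesale.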

docs/development/contributing.rst

Lines changed: 2 additions & 2 deletions

@@ -63,8 +63,8 @@ or ``--signoff`` to your usual ``git commit`` commands:
 
 .. code-block:: bash
 
-   git commit --signoff
-   git commit -s
+   git commit --signoff -m "This is my commit message"
+   git commit -s -m "This is my commit message"
 
 This will use your default git configuration which is found in .git/config. To change
 this, you can use the following commands:

docs/ecosystem.rst

Lines changed: 27 additions & 0 deletions

@@ -45,5 +45,32 @@ where NumPy can be used and what libraries it powers.
 
     numpy_arr = to_numpy(modin_df)
 
+to_ray
+------
+
+You can refer to `Ray Data`_ page to get more details on
+where Ray Dataset can be used and what libraries it powers.
+
+.. code-block:: python
+
+    from modin.pandas.io import to_ray
+
+    ray_dataset = to_ray(modin_df)
+
+to_dask
+-------
+
+You can refer to `Dask DataFrame`_ page to get more details on
+where Dask DataFrame can be used and what libraries it powers.
+
+.. code-block:: python
+
+    from modin.pandas.io import to_dask
+
+    dask_df = to_dask(modin_df)
+
 .. _pandas ecosystem: https://pandas.pydata.org/community/ecosystem.html
 .. _NumPy ecosystem: https://numpy.org
+.. _Ray Data: https://docs.ray.io/en/latest/data/data.html
+.. _Dask DataFrame: https://docs.dask.org/en/stable/dataframe.html

docs/flow/modin/config.rst

Lines changed: 27 additions & 0 deletions

@@ -56,3 +56,30 @@ API.
     # Changing value of `NPartitions`
     modin.config.NPartitions.put(16)
     print(modin.config.NPartitions.get()) # prints '16'
+
+One can also use config variables with a context manager in order to use
+some config only for a certain part of the code:
+
+.. code-block:: python
+
+    import modin.config as cfg
+
+    # Default value for this config is 'False'
+    print(cfg.RangePartitioning.get()) # False
+
+    # Set the config to 'True' inside of the context-manager
+    with cfg.context(RangePartitioning=True):
+        print(cfg.RangePartitioning.get()) # True
+        df.merge(...) # will use range-partitioning impl
+
+    # Once the context is over, the config gets back to its previous value
+    print(cfg.RangePartitioning.get()) # False
+
+    # You can also set multiple config at once when you pass a dictionary to 'cfg.context'
+    print(cfg.AsyncReadMode.get()) # False
+
+    with cfg.context(RangePartitioning=True, AsyncReadMode=True):
+        print(cfg.RangePartitioning.get()) # True
+        print(cfg.AsyncReadMode.get()) # True
+    print(cfg.RangePartitioning.get()) # False
+    print(cfg.AsyncReadMode.get()) # False
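The save-on-enter / restore-on-exit semantics documented above are not Modin-specific; they can be sketched with `contextlib`, using a plain dict as a hypothetical stand-in for `modin.config` variables:

```python
from contextlib import contextmanager

# Hypothetical in-memory config store standing in for modin.config.
_config = {"RangePartitioning": False, "AsyncReadMode": False}


def get(name):
    return _config[name]


@contextmanager
def context(**overrides):
    # Save the current values, apply the overrides, and restore the saved
    # values on exit -- even if the body raises.
    saved = {name: _config[name] for name in overrides}
    _config.update(overrides)
    try:
        yield
    finally:
        _config.update(saved)


print(get("RangePartitioning"))  # False
with context(RangePartitioning=True, AsyncReadMode=True):
    print(get("RangePartitioning"))  # True
    print(get("AsyncReadMode"))  # True
print(get("RangePartitioning"))  # False
```

The `try`/`finally` is the load-bearing piece: it guarantees the previous values come back even when the guarded block raises, which is what makes scoping a config to "a certain part of the code" safe.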
