DOC update Dask backend (#1411)
Co-authored-by: Loïc Estève <loic.esteve@ymail.com>
Co-authored-by: Thomas Moreau <thomas.moreau.2010@gmail.com>
3 people committed Apr 12, 2023
1 parent 5543c71 commit 2d284d1
Showing 3 changed files with 44 additions and 25 deletions.
11 changes: 8 additions & 3 deletions doc/parallel.rst
@@ -78,9 +78,14 @@ choice with the :func:`~joblib.parallel_backend` context manager.
Pyodide). In this case the loky backend is not available and the
default backend falls back to threading.

-Besides builtin joblib backends, we can use
-`Joblib Apache Spark Backend <https://github.com/joblib/joblib-spark>`_
-to distribute joblib tasks on a Spark cluster.
+In addition to the builtin joblib backends, there are several cluster-specific
+backends you can use:
+
+* `Dask <https://docs.dask.org/en/stable/>`_ backend for Dask clusters
+  (see :ref:`sphx_glr_auto_examples_parallel_distributed_backend_simple.py` for an example),
+* `Ray <https://docs.ray.io/en/latest/index.html>`_ backend for Ray clusters,
+* `Joblib Apache Spark Backend <https://github.com/joblib/joblib-spark>`_
+  to distribute joblib tasks on a Spark cluster.

.. _serialization_and_processes:

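As a side note on the list added above: a minimal sketch of how one of these external backends is wired up, here Spark via the joblib-spark package. The register_spark helper and the "spark" backend name follow that project's documentation; nothing below is part of this diff.

import joblib
from joblibspark import register_spark  # provided by the joblib-spark package

# make the "spark" backend name available to joblib
register_spark()

with joblib.parallel_backend("spark", n_jobs=2):
    # each delayed call is scheduled as a Spark task
    results = joblib.Parallel(verbose=5)(
        joblib.delayed(pow)(i, 2) for i in range(8)
    )
print(results)

The Dask and Ray backends follow the same register-then-select pattern, as sketched after the next two files.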
39 changes: 21 additions & 18 deletions examples/parallel/distributed_backend_simple.py
@@ -1,32 +1,37 @@
"""
-Using dask distributed for single-machine parallel computing
-=============================================================
+Using Dask for single-machine parallel computing
+================================================
-This example shows the simplest usage of the dask `distributed
-<https://distributed.readthedocs.io>`__ backend, on the local computer.
+This example shows the simplest usage of the
+`Dask <https://docs.dask.org/en/stable/>`_
+backend on your local machine.
This is useful for prototyping a solution, to later be run on a truly
-distributed cluster, as the only change to be made is the address of the
-scheduler.
+`distributed Dask cluster
+<https://docs.dask.org/en/stable/deploying.html#distributed-computing>`_,
+as the only change needed is the cluster class.
Another realistic usage scenario: combining dask code with joblib code,
for instance using dask for preprocessing data, and scikit-learn for
machine learning. In such a setting, it may be interesting to use
distributed as a backend scheduler for both dask and joblib, to
-orchestrate well the computation.
+orchestrate the computation.
"""

###############################################################################
# Setup the distributed client
###############################################################################
-from dask.distributed import Client
+from dask.distributed import Client, LocalCluster

-# If you have a remote cluster running Dask
-# client = Client('tcp://scheduler-address:8786')
+# replace with whichever cluster class you're using
+# https://docs.dask.org/en/stable/deploying.html#distributed-computing
+cluster = LocalCluster()
+# connect client to your cluster
+client = Client(cluster)

-# If you want Dask to set itself up on your personal computer
-client = Client(processes=False)
+# Monitor your computation with the Dask dashboard
+print(client.dashboard_link)

###############################################################################
# Run parallel computation using dask.distributed
@@ -37,18 +42,16 @@


def long_running_function(i):
-time.sleep(.1)
+time.sleep(0.1)
return i


###############################################################################
# The verbose messages below show that the backend is indeed the
# dask.distributed one
-with joblib.parallel_backend('dask'):
+with joblib.parallel_backend("dask"):
joblib.Parallel(verbose=100)(
-joblib.delayed(long_running_function)(i)
-for i in range(10))
+joblib.delayed(long_running_function)(i) for i in range(10)
+)

###############################################################################
# Progress in computation can be followed on the distributed web
# interface, see https://dask.pydata.org/en/latest/diagnostics-distributed.html
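The updated example builds a LocalCluster on the local machine; as the new docstring says, moving to a real deployment only changes how the cluster and client are created. A hedged sketch, not part of this commit, of the same joblib code pointed at an already running Dask scheduler (the address is the placeholder from the old comment, not a real host):

import time

import joblib
from dask.distributed import Client

# connect to an existing scheduler instead of starting a LocalCluster;
# the address below is a placeholder
client = Client("tcp://scheduler-address:8786")


def long_running_function(i):
    time.sleep(0.1)
    return i


with joblib.parallel_backend("dask"):
    joblib.Parallel(verbose=100)(
        joblib.delayed(long_running_function)(i) for i in range(10)
    )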
19 changes: 15 additions & 4 deletions joblib/parallel.py
@@ -151,10 +151,21 @@ class parallel_backend(object):
multiprocessing and loky may not be available, in which case joblib
defaults to threading.
-In addition, if the `dask` and `distributed` Python packages are installed,
-it is possible to use the 'dask' backend for better scheduling of nested
-parallel calls without over-subscription and potentially distribute
-parallel calls over a networked cluster of several hosts.
+You can also use the `Dask <https://docs.dask.org/en/stable/>`_ joblib
+backend to distribute work across machines. This works well with
+scikit-learn estimators with the ``n_jobs`` parameter, for example::
+>>> import joblib # doctest: +SKIP
+>>> from sklearn.model_selection import GridSearchCV # doctest: +SKIP
+>>> from dask.distributed import Client, LocalCluster # doctest: +SKIP
+>>> # create a local Dask cluster
+>>> cluster = LocalCluster() # doctest: +SKIP
+>>> client = Client(cluster) # doctest: +SKIP
+>>> grid_search = GridSearchCV(estimator, param_grid, n_jobs=-1)
+... # doctest: +SKIP
+>>> with joblib.parallel_backend("dask", scatter=[X, y]): # doctest: +SKIP
+...     grid_search.fit(X, y)
It is also possible to use the distributed 'ray' backend for distributing
the workload to a cluster of nodes. To use the 'ray' joblib backend add
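The docstring continues (trimmed in this view) with how to enable the 'ray' backend. A hedged sketch of that pattern, assuming Ray's documented register_ray helper from ray.util.joblib; this code is not part of the diff:

import joblib
from ray.util.joblib import register_ray  # provided by the ray package

# register the "ray" backend with joblib before selecting it
register_ray()

with joblib.parallel_backend("ray"):
    joblib.Parallel()(joblib.delayed(abs)(-i) for i in range(5))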
