diff --git a/.gitignore b/.gitignore
index e4e90f2fc0b..5459b8631b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -73,11 +73,8 @@ instance/
 
 # Sphinx documentation
 docs/_build/
-# automatically generated content
-docs/source/modules/auto_generated/
-docs/source/api_reference/modules/auto_generated/
-
-# automatically sym-linked example notebooks
+docs/source/api_reference/auto_generated/
+docs/estimator_overview_table.md
 docs/source/examples
 
 # PyBuilder
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 7d9d25e8037..0692fbe32e4 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -18,3 +18,4 @@ python:
 
 sphinx:
   configuration: docs/source/conf.py
+#  fail_on_warning: True
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 0323318b859..9fb15437f5c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -310,8 +310,7 @@ Changed
 
 Fixed
 ~~~~~
-* Fix links in Readthedocs and Binder launch button (#416)
-@mloning
+* Fix links in Readthedocs and Binder launch button (#416) @mloning
 * Fixed small bug in performance metrics (#422) @krumeto
 * Resolved warnings in notebook examples (#418) @alwinw
 * Resolves #325 ModuleNotFoundError for soft dependencies (#410) @alwinw
diff --git a/docs/Makefile b/docs/Makefile
index c7946fbd17d..957e8aa0894 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -3,7 +3,7 @@
 # You can set these variables from the command line.
 PREPROCESS    = sphinx-apidoc
 APIDOCTEMPLATEDIR = source/_templates/apidoc
-AUTOGENDIR    = source/modules/auto_generated
+AUTOGENDIR    = source/api_reference/auto_generated
 SPHINXOPTS    =
 SPHINXBUILD   = sphinx-build
 SOURCEDIR     = source
diff --git a/docs/estimator_overview_table.md b/docs/estimator_overview_table.md
deleted file mode 100644
index 7aa1545d2ef..00000000000
--- a/docs/estimator_overview_table.md
+++ /dev/null
@@ -1,112 +0,0 @@
-| Class Name                                                                                                                                                                                              | Estimator Type                           | Authors                                                                    |
-|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------|:---------------------------------------------------------------------------|
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.arima.ARIMA.html">ARIMA</a>                                                                           | forecasting                              | Markus Löning & Hongyi Yang                                                |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.dists_kernels.compose_tab_to_panel.AggrDist.html">AggrDist</a>                                                    | dists_kernels                            | fkiraly                                                                    |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.kernel_based.Arsenal.html">Arsenal</a>                                                             | classification::kernel_based             | Matthew Middlehurst & Oleksii Kachaiev                                     |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.arima.AutoARIMA.html">AutoARIMA</a>                                                                   | forecasting                              | Markus Löning & Hongyi Yang                                                |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.acf.AutoCorrelationTransformer.html">AutoCorrelationTransformer</a>                        | transformations::series                  | Afzal Ansari                                                               |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.ets.AutoETS.html">AutoETS</a>                                                                         | forecasting                              | Hongyi Yang                                                                |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.bats.BATS.html">BATS</a>                                                                              | forecasting                              | Martin Walter                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.dictionary_based.BOSSEnsemble.html">BOSSEnsemble</a>                                               | classification::dictionary_based         | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.boxcox.BoxCoxTransformer.html">BoxCoxTransformer</a>                                       | transformations::series                  | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.interval_based.CanonicalIntervalForest.html">CanonicalIntervalForest</a>                           | classification::interval_based           | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.catch22.Catch22.html">Catch22</a>                                                           | transformations::panel                   | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.hybrid.Catch22ForestClassifier.html">Catch22ForestClassifier</a>                                   | classification::hybrid                   | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.compose.ColumnConcatenator.html">ColumnConcatenator</a>                                     | transformations::panel                   | Markus Löning & Sajay Ganesh                                               |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.compose.ColumnEnsembleClassifier.html">ColumnEnsembleClassifier</a>                                | classification::compose                  | Aaron Bostrom                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.compose.ColumnTransformer.html">ColumnTransformer</a>                                       | transformations::panel                   | Markus Löning & Sajay Ganesh                                               |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.compose.ComposableTimeSeriesForestClassifier.html">ComposableTimeSeriesForestClassifier</a>        | classification::compose                  | Markus Löning & Ayushmaan Seth                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.regression.compose.ComposableTimeSeriesForestRegressor.html">ComposableTimeSeriesForestRegressor</a>              | regression::compose                      | Markus Löning & Ayushmaan Seth                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.detrend.ConditionalDeseasonalizer.html">ConditionalDeseasonalizer</a>                      | transformations::series::detrend         | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.dictionary_based.ContractableBOSS.html">ContractableBOSS</a>                                       | classification::dictionary_based         | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.shapelets.ContractedShapeletTransform.html">ContractedShapeletTransform</a>                 | transformations::panel                   | Jason Lines & David Guijo                                                  |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.cos.CosineTransformer.html">CosineTransformer</a>                                          | transformations::series                  | Afzal Ansari                                                               |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.croston.Croston.html">Croston</a>                                                                     | forecasting                              | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.dwt.DWTTransformer.html">DWTTransformer</a>                                                 | transformations::panel                   | Vincent Nicholson                                                          |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.summarize.DerivativeSlopeTransformer.html">DerivativeSlopeTransformer</a>                   | transformations::panel::summarize        | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.detrend.Deseasonalizer.html">Deseasonalizer</a>                                            | transformations::series::detrend         | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.detrend.Detrender.html">Detrender</a>                                                      | transformations::series::detrend         | Markus Löning & Svea Meyer                                                 |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.DirRecTabularRegressionForecaster.html">DirRecTabularRegressionForecaster</a>                 | forecasting::compose                     | Ayushmaan Seth, Kavin Anand, Luis Zugasti, Lovkush Agarwal & Markus Löning |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.DirRecTimeSeriesRegressionForecaster.html">DirRecTimeSeriesRegressionForecaster</a>           | forecasting::compose                     | Ayushmaan Seth, Kavin Anand, Luis Zugasti, Lovkush Agarwal & Markus Löning |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.DirectTabularRegressionForecaster.html">DirectTabularRegressionForecaster</a>                 | forecasting::compose                     | Ayushmaan Seth, Kavin Anand, Luis Zugasti, Lovkush Agarwal & Markus Löning |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.DirectTimeSeriesRegressionForecaster.html">DirectTimeSeriesRegressionForecaster</a>           | forecasting::compose                     | Ayushmaan Seth, Kavin Anand, Luis Zugasti, Lovkush Agarwal & Markus Löning |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.interval_based.DrCIF.html">DrCIF</a>                                                               | classification::interval_based           | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.distance_based.ElasticEnsemble.html">ElasticEnsemble</a>                                           | classification::distance_based           | Jason Lines                                                                |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.EnsembleForecaster.html">EnsembleForecaster</a>                                               | forecasting::compose                     | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.exp_smoothing.ExponentialSmoothing.html">ExponentialSmoothing</a>                                     | forecasting                              | Markus Löning & @big-o                                                     |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.series_as_features.compose.FeatureUnion.html">FeatureUnion</a>                                                    | series_as_features::compose              | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.summarize.FittedParamExtractor.html">FittedParamExtractor</a>                               | transformations::panel::summarize        | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.model_selection.ForecastingGridSearchCV.html">ForecastingGridSearchCV</a>                             | forecasting::model_selection             | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.ForecastingPipeline.html">ForecastingPipeline</a>                                             | forecasting::compose                     | Markus Löning & Martin Walter                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.model_selection.ForecastingRandomizedSearchCV.html">ForecastingRandomizedSearchCV</a>                 | forecasting::model_selection             | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.hcrystalball.HCrystalBallForecaster.html">HCrystalBallForecaster</a>                                  | forecasting                              | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.hybrid.HIVECOTEV1.html">HIVECOTEV1</a>                                                             | classification::hybrid                   | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.hog1d.HOG1DTransformer.html">HOG1DTransformer</a>                                           | transformations::panel                   | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.outlier_detection.HampelFilter.html">HampelFilter</a>                                      | transformations::series                  | Martin Walter                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.impute.Imputer.html">Imputer</a>                                                           | transformations::series                  | Martin Walter                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.dictionary_based.IndividualBOSS.html">IndividualBOSS</a>                                           | classification::dictionary_based         | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.dictionary_based.IndividualTDE.html">IndividualTDE</a>                                             | classification::dictionary_based         | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.segment.IntervalSegmenter.html">IntervalSegmenter</a>                                       | transformations::panel                   | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.distance_based.KNeighborsTimeSeriesClassifier.html">KNeighborsTimeSeriesClassifier</a>             | classification::distance_based           | Jason Lines & TonyBagnall                                                  |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.boxcox.LogTransformer.html">LogTransformer</a>                                             | transformations::series                  | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.dictionary_based.MUSE.html">MUSE</a>                                                               | classification::dictionary_based         | Patrick Schäfer                                                            |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.matrix_profile.MatrixProfile.html">MatrixProfile</a>                                        | transformations::panel                   | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.matrix_profile.MatrixProfileTransformer.html">MatrixProfileTransformer</a>                 | transformations::series                  | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.summarize.MeanTransformer.html">MeanTransformer</a>                                        | transformations::series                  | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.rocket.MiniRocket.html">MiniRocket</a>                                                      | transformations::panel::rocket           | Angus Dempster                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.rocket.MiniRocketMultivariate.html">MiniRocketMultivariate</a>                              | transformations::panel::rocket           | Angus Dempster                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.shapelet_based.mrseql.mrseql.MrSEQLClassifier.html">MrSEQLClassifier</a>                           | classification::shapelet_based::mrseql   | Thach Le Nguyen                                                            |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.MultioutputTabularRegressionForecaster.html">MultioutputTabularRegressionForecaster</a>       | forecasting::compose                     | Ayushmaan Seth, Kavin Anand, Luis Zugasti, Lovkush Agarwal & Markus Löning |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.MultioutputTimeSeriesRegressionForecaster.html">MultioutputTimeSeriesRegressionForecaster</a> | forecasting::compose                     | Ayushmaan Seth, Kavin Anand, Luis Zugasti, Lovkush Agarwal & Markus Löning |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.MultiplexForecaster.html">MultiplexForecaster</a>                                             | forecasting::compose                     | Kutay Koralturk                                                            |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.naive.NaiveForecaster.html">NaiveForecaster</a>                                                       | forecasting                              | Markus Löning & Piyush Gade                                                |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.online_learning.OnlineEnsembleForecaster.html">OnlineEnsembleForecaster</a>                           | forecasting::online_learning             | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.compose.OptionalPassthrough.html">OptionalPassthrough</a>                                  | transformations::series                  | Martin Walter                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.dictionary_based.PAA.html">PAA</a>                                                          | transformations::panel::dictionary_based | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.pca.PCATransformer.html">PCATransformer</a>                                                 | transformations::panel                   | Patrick Rockenschaub                                                       |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.padder.PaddingTransformer.html">PaddingTransformer</a>                                      | transformations::panel                   | Aaron Bostrom                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.acf.PartialAutoCorrelationTransformer.html">PartialAutoCorrelationTransformer</a>          | transformations::series                  | Afzal Ansari                                                               |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.summarize.PlateauFinder.html">PlateauFinder</a>                                             | transformations::panel::summarize        | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.trend.PolynomialTrendForecaster.html">PolynomialTrendForecaster</a>                                   | forecasting                              | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.fbprophet.Prophet.html">Prophet</a>                                                                   | forecasting                              | Martin Walter                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.distance_based.ProximityForest.html">ProximityForest</a>                                           | classification::distance_based           | George Oastler                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.distance_based.ProximityStump.html">ProximityStump</a>                                             | classification::distance_based           | George Oastler (linkedin.com/goastler; github.com/goastler)                |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.distance_based.ProximityTree.html">ProximityTree</a>                                               | classification::distance_based           | George Oastler                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.annotation.adapters.PyODAnnotator.html">PyODAnnotator</a>                                                         | annotation::adapters                     | mloning, satya-pattnaik & fkiraly                                          |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.kernel_based.ROCKETClassifier.html">ROCKETClassifier</a>                                           | classification::kernel_based             | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.summarize.RandomIntervalFeatureExtractor.html">RandomIntervalFeatureExtractor</a>           | transformations::panel::summarize        | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.segment.RandomIntervalSegmenter.html">RandomIntervalSegmenter</a>                           | transformations::panel                   | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.interval_based.RandomIntervalSpectralForest.html">RandomIntervalSpectralForest</a>                 | classification::interval_based           | Tony Bagnall & Yi-Xuan Xu                                                  |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.RecursiveTabularRegressionForecaster.html">RecursiveTabularRegressionForecaster</a>           | forecasting::compose                     | Ayushmaan Seth, Kavin Anand, Luis Zugasti, Lovkush Agarwal & Markus Löning |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.RecursiveTimeSeriesRegressionForecaster.html">RecursiveTimeSeriesRegressionForecaster</a>     | forecasting::compose                     | Ayushmaan Seth, Kavin Anand, Luis Zugasti, Lovkush Agarwal & Markus Löning |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.rocket.Rocket.html">Rocket</a>                                                              | transformations::panel::rocket           | Angus Dempster                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.dictionary_based.SAX.html">SAX</a>                                                          | transformations::panel::dictionary_based | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.dictionary_based.SFA.html">SFA</a>                                                          | transformations::panel::dictionary_based | Matthew Middlehurst & Patrick Schäfer                                      |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.dists_kernels.scipy_dist.ScipyDist.html">ScipyDist</a>                                                            | dists_kernels                            | fkiraly                                                                    |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.compose.SeriesToPrimitivesRowTransformer.html">SeriesToPrimitivesRowTransformer</a>         | transformations::panel                   | Markus Löning & Sajay Ganesh                                               |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.compose.SeriesToSeriesRowTransformer.html">SeriesToSeriesRowTransformer</a>                 | transformations::panel                   | Markus Löning & Sajay Ganesh                                               |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.distance_based.ShapeDTW.html">ShapeDTW</a>                                                         | classification::distance_based           | Vincent Nicholson                                                          |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.shapelets.ShapeletTransform.html">ShapeletTransform</a>                                     | transformations::panel                   | Jason Lines & David Guijo                                                  |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.shapelet_based.ShapeletTransformClassifier.html">ShapeletTransformClassifier</a>                   | classification::shapelet_based           | Tony Bagnall                                                               |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.signature_based.SignatureClassifier.html">SignatureClassifier</a>                                  | classification::signature_based          | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.signature_based.SignatureTransformer.html">SignatureTransformer</a>                         | transformations::panel::signature_based  | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.segment.SlidingWindowSegmenter.html">SlidingWindowSegmenter</a>                             | transformations::panel                   | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.slope.SlopeTransformer.html">SlopeTransformer</a>                                           | transformations::panel                   | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.StackingForecaster.html">StackingForecaster</a>                                               | forecasting::compose                     | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.interval_based.SupervisedTimeSeriesForest.html">SupervisedTimeSeriesForest</a>                     | classification::interval_based           | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.tbats.TBATS.html">TBATS</a>                                                                           | forecasting                              | Martin Walter                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.tsfresh.TSFreshFeatureExtractor.html">TSFreshFeatureExtractor</a>                           | transformations::panel                   | Ayushmaan Seth, Markus Löning & Alwin Wang                                 |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.tsfresh.TSFreshRelevantFeatureExtractor.html">TSFreshRelevantFeatureExtractor</a>           | transformations::panel                   | Ayushmaan Seth, Markus Löning & Alwin Wang                                 |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.interpolate.TSInterpolator.html">TSInterpolator</a>                                         | transformations::panel                   | no author info                                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.adapt.TabularToSeriesAdaptor.html">TabularToSeriesAdaptor</a>                              | transformations::series                  | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.reduce.Tabularizer.html">Tabularizer</a>                                                    | transformations::panel                   | Markus Löning                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.dictionary_based.TemporalDictionaryEnsemble.html">TemporalDictionaryEnsemble</a>                   | classification::dictionary_based         | Matthew Middlehurst                                                        |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.theta.ThetaForecaster.html">ThetaForecaster</a>                                                       | forecasting                              | @big-o & Markus Löning                                                     |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.series.theta.ThetaLinesTransformer.html">ThetaLinesTransformer</a>                                | transformations::series                  | Guzal Bulatova & Markus Löning                                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.interval_based.TimeSeriesForestClassifier.html">TimeSeriesForestClassifier</a>                     | classification::interval_based           | Tony Bagnall, kkoziara, luiszugasti & kanand77                             |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.regression.interval_based.TimeSeriesForestRegressor.html">TimeSeriesForestRegressor</a>                           | regression::interval_based               | Tony Bagnall, kkoziara, luiszugasti, kanand77 & Markus Löning              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.clustering.TimeSeriesKMeans.html">TimeSeriesKMeans</a>                                                            | clustering                               | Christopher Holder & Tony Bagnall                                          |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.clustering.TimeSeriesKMedoids.html">TimeSeriesKMedoids</a>                                                        | clustering                               | Christopher Holder & Tony Bagnall                                          |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.forecasting.compose.TransformedTargetForecaster.html">TransformedTargetForecaster</a>                             | forecasting::compose                     | Markus Löning & Martin Walter                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.transformations.panel.truncation.TruncationTransformer.html">TruncationTransformer</a>                            | transformations::panel                   | Aaron Bostrom                                                              |
-| <a href="https://www.sktime.org/en/latest/api_reference/modules/auto_generated/sktime.classification.dictionary_based.WEASEL.html">WEASEL</a>                                                           | classification::dictionary_based         | Patrick Schäfer & Arik Ermshaus                                            |
diff --git a/docs/requirements.txt b/docs/requirements.txt
index c4cc19bc6b8..b6abafc9234 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,7 +1,10 @@
 jupyter
-m2r2
-nbsphinx
+myst-parser
+nbsphinx==0.8.6
 numpydoc
-sphinx==3.2.*
-sphinx_rtd_theme
+pydata-sphinx-theme
+sphinx==4.1.1
+sphinx-gallery==0.6.0
+sphinx-panels==0.6.0
+sphinx_issues==1.2.0
 tabulate
diff --git a/docs/source/_static/fields.css b/docs/source/_static/css/fields.css
similarity index 100%
rename from docs/source/_static/fields.css
rename to docs/source/_static/css/fields.css
diff --git a/docs/source/_templates/apidoc/module.rst_t b/docs/source/_templates/apidoc/module.rst_t
deleted file mode 100644
index 0c394131045..00000000000
--- a/docs/source/_templates/apidoc/module.rst_t
+++ /dev/null
@@ -1,9 +0,0 @@
-{%- if show_headings %}
-{{- basename | e | heading(2) }}
-
-{% endif -%}
-.. automodule:: {{ qualname }}
-{%- for option in automodule_options %}
-   :{{ option }}:
-{%- endfor %}
-
diff --git a/docs/source/_templates/apidoc/package.rst_t b/docs/source/_templates/apidoc/package.rst_t
deleted file mode 100644
index 08bafcc2a83..00000000000
--- a/docs/source/_templates/apidoc/package.rst_t
+++ /dev/null
@@ -1,52 +0,0 @@
-{%- macro automodule(modname, options) -%}
-.. automodule:: {{ modname }}
-{%- for option in options %}
-   :{{ option }}:
-{%- endfor %}
-{%- endmacro %}
-
-{%- macro toctree(docnames) -%}
-.. toctree::
-{% for docname in docnames %}
-   {{ docname }}
-{%- endfor %}
-{%- endmacro %}
-
-{%- if is_namespace %}
-{{- [pkgname, "namespace"] | join(" ") | e | heading }}
-{% else %}
-{{- pkgname | e | heading(2) }}
-{% endif %}
-
-{%- if modulefirst and not is_namespace %}
-{{ automodule(pkgname, automodule_options) }}
-{% endif %}
-
-{%- if subpackages %}
-Subpackages
------------
-
-{{ toctree(subpackages) }}
-{% endif %}
-
-{%- if submodules %}
-Submodules
-----------
-{% if separatemodules %}
-{{ toctree(submodules) }}
-{%- else %}
-{%- for submodule in submodules %}
-{% if show_headings %}
-{{- submodule | e | heading(2) }}
-{% endif %}
-{{ automodule(submodule, automodule_options) }}
-{% endfor %}
-{%- endif %}
-{% endif %}
-
-{%- if not modulefirst and not is_namespace %}
-Module contents
----------------
-
-{{ automodule(pkgname, automodule_options) }}
-{% endif %}
diff --git a/docs/source/_templates/apidoc/toc.rst_t b/docs/source/_templates/apidoc/toc.rst_t
deleted file mode 100644
index f0877eeb2f8..00000000000
--- a/docs/source/_templates/apidoc/toc.rst_t
+++ /dev/null
@@ -1,8 +0,0 @@
-{{ header | heading }}
-
-.. toctree::
-   :maxdepth: {{ maxdepth }}
-{% for docname in docnames %}
-   {{ docname }}
-{%- endfor %}
-
diff --git a/docs/source/_templates/class.rst b/docs/source/_templates/class.rst
index 79ff2cf8077..e45bca4524c 100644
--- a/docs/source/_templates/class.rst
+++ b/docs/source/_templates/class.rst
@@ -1,4 +1,4 @@
-:mod:`{{module}}`.{{objname}}
+{{objname}}
 {{ underline }}==============
 
 .. currentmodule:: {{ module }}
diff --git a/docs/source/_templates/class_with_call.rst b/docs/source/_templates/class_with_call.rst
index 70e46d35831..9fc4b65ce10 100644
--- a/docs/source/_templates/class_with_call.rst
+++ b/docs/source/_templates/class_with_call.rst
@@ -1,14 +1,10 @@
-:mod:`{{module}}`.{{objname}}
+{{objname}}
 {{ underline }}===============
 
 .. currentmodule:: {{ module }}
 
 .. autoclass:: {{ objname }}
-
-   {% block methods %}
-   .. automethod:: __init__
-   .. automethod:: __call__
-   {% endblock %}
+   :special-members: __call__
 
 .. include:: {{module}}.{{objname}}.examples
 
diff --git a/docs/source/_templates/class_without_init.rst b/docs/source/_templates/class_without_init.rst
deleted file mode 100644
index 307b0199c30..00000000000
--- a/docs/source/_templates/class_without_init.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-:mod:`{{module}}`.{{objname}}
-{{ underline }}==============
-
-.. currentmodule:: {{ module }}
-
-.. autoclass:: {{ objname }}
diff --git a/docs/source/_templates/function.rst b/docs/source/_templates/function.rst
index f4b11eda770..61d178dcbfe 100644
--- a/docs/source/_templates/function.rst
+++ b/docs/source/_templates/function.rst
@@ -1,4 +1,4 @@
-:mod:`{{module}}`.{{objname}}
+{{objname}}
 {{ underline }}====================
 
 .. currentmodule:: {{ module }}
diff --git a/docs/source/_templates/module.rst b/docs/source/_templates/module.rst
deleted file mode 100644
index e5a4b5402d3..00000000000
--- a/docs/source/_templates/module.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-.. _mod-{{ fullname }}:
-
-{{ fullname | underline }}
-
-.. automodule:: {{ fullname }}
-
-   {% block functions %}
-   {% if functions %}
-   .. rubric:: Functions
-
-   .. autosummary::
-      :toctree: {{ objname }}
-      :template: function.rst
-   {% for item in functions %}
-      {{ item }}
-   {%- endfor %}
-   {% endif %}
-   {% endblock %}
-
-   {% block classes %}
-   {% if classes %}
-   .. rubric:: Classes
-
-   .. autosummary::
-      :toctree: {{ objname }}
-      :template: class.rst
-   {% for item in classes %}
-      {{ item }}
-   {%- endfor %}
-   {% endif %}
-   {% endblock %}
-
-   {% block exceptions %}
-   {% if exceptions %}
-   .. rubric:: Exceptions
-
-   .. autosummary::
-   {% for item in exceptions %}
-      {{ item }}
-   {%- endfor %}
-   {% endif %}
-   {% endblock %}
diff --git a/docs/source/about.rst b/docs/source/about.rst
index ed4e54ce341..b21d3ce941a 100644
--- a/docs/source/about.rst
+++ b/docs/source/about.rst
@@ -1,130 +1,105 @@
 .. _about:
 
-About us
-========
+=====
+About
+=====
 
-Mission statement
------------------
+.. toctree::
+   :maxdepth: 1
+   :hidden:
 
-sktime enables understandable and composable machine learning with time
-series. It provides `scikit-learn <https://scikit-learn.org/stable/>`_
-compatible algorithms and model composition tools, supported by a clear
-taxonomy of learning tasks, with instructive documentation and a friendly community.
+   about/mission.md
+   about/contributors.md
+   about/history.md
+   about/funding.md
+   about/citation.md
+   about/artwork.md
 
+Learn more about the sktime project and its community.
 
-History
--------
+.. panels::
+    :card: + intro-card text-center
 
-sktime was started in April 2019 as a collaborative project between
-Franz Király, Markus Löning, Tony Bagnall and Jason
-Lines. In the first year, it grew into a community-driven project with
-contributions from researchers and practitioners from around the globe.
+    ---
 
+    Mission
+    ^^^^^^^
 
-Authors
--------
+    Learn more about sktime's mission.
 
-For an overview of current and past contributors, please see our
-:ref:`contributors page <contributors>`.
+    +++
 
+    .. link-button:: mission
+            :type: ref
+            :text: Our Mission
+            :classes: btn-block btn-secondary stretched-link
 
-Citing sktime
--------------
+    ---
 
-If you use sktime in a scientific publication, we would appreciate
-citations to the following paper:
+    Contributors
+    ^^^^^^^^^^^^
 
-* [paper] `Markus Löning, Anthony Bagnall, Sajaysurya Ganesh, Viktor Kazakov, Jason Lines, Franz Király (2019): “sktime: A Unified Interface for Machine Learning with Time Series” <http://learningsys.org/neurips19/assets/papers/sktime_ml_systems_neurips2019.pdf>`_
-* [software] `Markus Löning, Tony Bagnall, Sajaysurya Ganesh, George Oastler, Jason Lines, ViktorKaz, …, Aadesh Deshmukh (2020). alan-turing-institute/sktime. Zenodo. http://doi.org/10.5281/zenodo.3749000 <http://doi.org/10.5281/zenodo.3749000>`_
+    The wonderful people who make the project possible.
 
+    +++
 
-Artwork
--------
+    .. link-button:: contributors
+            :type: ref
+            :text: Contributors
+            :classes: btn-block btn-secondary stretched-link
 
-High-quality logos are available in the `docs/source/images/ <https://github.com/alan-turing-institute/sktime/tree/main/docs/source/images>`_ directory.
+    ---
 
-.. image:: images/sktime-logo-no-text.jpg
-   :align: center
+    History
+    ^^^^^^^
 
-Funding
--------
+    Learn how sktime got here.
 
-sktime is a community-driven project, however institutional and private
-grants help to assure its sustainability.
+    +++
 
-We would like to thank the following funders.
+    .. link-button:: history
+            :type: ref
+            :text: History
+            :classes: btn-block btn-secondary stretched-link
 
-...................................
+    ---
 
-.. raw:: html
+    Funding
+    ^^^^^^^
 
-   <div class="sk-sponsor-div">
-   <div class="sk-sponsor-div-box">
+    Thank you to sktime's supporters.
 
-`The Alan Turing Institute <https://turing.ac.uk>`_
-funded three months of the initial development under the UKRI Strategic
-Priorities Fund (EPSRC grant no EP/T001569/1), particularly the `Tools,
-Practices and Systems <https://www.turing.ac
-.uk/events/tools-practices-and-systems-data-science-and-artificial-intelligence-scoping-workshop>`_ theme within that grant.
+    +++
 
+    .. link-button:: funding
+            :type: ref
+            :text: Fund sktime
+            :classes: btn-block btn-secondary stretched-link
 
-.. raw:: html
+    ---
 
-   </div>
-   <div class="sk-sponsor-div-box">
+    Citation
+    ^^^^^^^^
 
-.. image:: images/the-alan-turing-institute.png
-   :width: 100pt
-   :target: https://turing.ac.uk/
+    Learn how to cite sktime.
 
-.. raw:: html
+    +++
 
-   </div>
-   </div>
+    .. link-button:: citation
+            :type: ref
+            :text: Citation
+            :classes: btn-block btn-secondary stretched-link
 
+    ---
 
-...................................
+    Artwork
+    ^^^^^^^
 
-.. raw:: html
+    Our logo and other graphics.
 
-   <div class="sk-sponsor-div">
-   <div class="sk-sponsor-div-box">
+    +++
 
-Markus Löning's contribution was supported by the `UK Economic and Social
-Research Council (ESRC) <https://esrc.ukri.org>`_, the `Consumer Data
-Research Centre (CDRC) <https://www.cdrc.ac.uk>`_, and the Enrichment
-Scheme at the `The Alan Turing Institute <https://turing.ac.uk>`_.
-
-
-.. raw:: html
-
-   </div>
-   <div class="sk-sponsor-div-box">
-
-.. image:: images/esrc-ukri.png
-   :width: 100pt
-   :target: https://esrc.ukri.org
-
-.. image:: images/cdrc.jpg
-   :width: 100pt
-   :target: https://www.cdrc.ac.uk
-
-.. raw:: html
-
-   </div>
-   </div>
-
-
-Sprints
--------
-
-The `2019 joint sktime MLJ development sprint <https://github.com/sktime/sktime-workshops/tree/master/previous_workshops/2019_sktime_MLJ_joint_dev_sprint>`_ was kindly hosted by `UCL
-<https://www.ucl.ac.uk>`_ and `The Alan Turing Institute <https://turing.ac
-.uk>`_. Some participants could attend thanks to the
-initial funding of the `The Alan Turing Institute <https://turing.ac.uk>`_.
-
-
-Infrastructure support
-----------------------
-
-We would also like to thank `Microsoft Azure <https://azure.microsoft.com/en-gb/services/devops/>`_, `GitHub Actions <https://docs.github.com/en/free-pro-team@latest/actions>`_, and `AppVeyor <https://www.appveyor.com>`_, `ReadtheDocs <https://readthedocs.org>`_ for the free computing time on their Continuous Integration servers.
+    .. link-button:: artwork
+            :type: ref
+            :text: Artwork
+            :classes: btn-block btn-secondary stretched-link
diff --git a/docs/source/about/artwork.md b/docs/source/about/artwork.md
new file mode 100644
index 00000000000..c3c390c2584
--- /dev/null
+++ b/docs/source/about/artwork.md
@@ -0,0 +1,7 @@
+# Artwork
+
+High-quality logos are available in the [`docs/source/images/`](https://github.com/alan-turing-institute/sktime/tree/main/docs/source/images) directory on GitHub.
+
+```{image} ../images/sktime-logo-no-text.jpg
+:align: center
+```
diff --git a/docs/source/about/citation.md b/docs/source/about/citation.md
new file mode 100644
index 00000000000..f5e199800b0
--- /dev/null
+++ b/docs/source/about/citation.md
@@ -0,0 +1,7 @@
+# Citing sktime
+
+If you use sktime in a scientific publication, we would appreciate
+citations to the following paper:
+
+* [Markus Löning, Anthony Bagnall, Sajaysurya Ganesh, Viktor Kazakov, Jason Lines, Franz Király (2019): “sktime: A Unified Interface for Machine Learning with Time Series”](http://learningsys.org/neurips19/assets/papers/sktime_ml_systems_neurips2019.pdf)
+* [Markus Löning, Tony Bagnall, Sajaysurya Ganesh, George Oastler, Jason Lines, ViktorKaz, …, Aadesh Deshmukh (2020). alan-turing-institute/sktime. Zenodo. http://doi.org/10.5281/zenodo.3749000](http://doi.org/10.5281/zenodo.3749000)
diff --git a/docs/source/about/contributors.md b/docs/source/about/contributors.md
new file mode 100644
index 00000000000..362e062fc5d
--- /dev/null
+++ b/docs/source/about/contributors.md
@@ -0,0 +1,2 @@
+```{include} ../../../CONTRIBUTORS.md
+```
diff --git a/docs/source/about/funding.md b/docs/source/about/funding.md
new file mode 100644
index 00000000000..11def470908
--- /dev/null
+++ b/docs/source/about/funding.md
@@ -0,0 +1,48 @@
+# Funding
+
+sktime is a community-driven project; however, institutional and private grants help to ensure its sustainability.
+
+We would like to thank the following supporters.
+
+## Research grants
+
+[The Alan Turing Institute] funded three months of the initial development under the UKRI Strategic
+Priorities Fund (EPSRC grant no EP/T001569/1), particularly the [Tools,
+Practices and Systems](https://www.turing.ac.uk/events/tools-practices-and-systems-data-science-and-artificial-intelligence-scoping-workshop) theme within that grant.
+
+Markus Löning's contribution was supported by the [UK Economic and Social
+Research Council (ESRC)](https://esrc.ukri.org), the [Consumer Data
+Research Centre (CDRC)](https://www.cdrc.ac.uk), and the Enrichment
+Scheme at [The Alan Turing Institute].
+
+```{image} ../images/the-alan-turing-institute.png
+:width: 100pt
+:target: https://turing.ac.uk/
+```
+
+```{image} ../images/esrc-ukri.png
+:width: 100pt
+:target: https://esrc.ukri.org
+```
+
+```{image} ../images/cdrc.jpg
+:width: 100pt
+:target: https://www.cdrc.ac.uk
+```
+
+## Institutional sponsorship
+
+The [2019 joint sktime MLJ development sprint](https://github.com/sktime/sktime-workshops/tree/master/previous_workshops/2019_sktime_MLJ_joint_dev_sprint) was kindly hosted by [UCL] and [The Alan Turing Institute]. Some participants could attend thanks to the
+initial funding of [The Alan Turing Institute].
+
+## Infrastructure support
+
+We would also like to thank [Microsoft Azure], [GitHub Actions], [AppVeyor] and [ReadtheDocs] for the free compute time on their servers.
+
+[microsoft azure]: https://azure.microsoft.com/en-gb/services/devops/
+[github actions]: https://docs.github.com/en/free-pro-team@latest/actions
+[appveyor]: https://www.appveyor.com
+[readthedocs]: https://readthedocs.org
+
+[the alan turing institute]: https://turing.ac.uk
+[ucl]: https://www.ucl.ac.uk
diff --git a/docs/source/about/history.md b/docs/source/about/history.md
new file mode 100644
index 00000000000..831ce9a74f9
--- /dev/null
+++ b/docs/source/about/history.md
@@ -0,0 +1,11 @@
+# History
+
+sktime was started in April 2019 as a collaborative project between Franz Király, Markus Löning, Anthony Bagnall and Jason Lines.
+In the first year, it grew into a community-driven project with contributions from researchers and practitioners from around the globe.
+
+Today, sktime continues to undergo rapid development to refine its API, while adding new features and algorithms for a range of time series machine learning tasks.
+Development is supported by the original project members, new core developers and the broader community (see [contributors]).
+
+If you're interested in contributing, you can find out how you can contribute in our developer information.
+
+[contributors]: contributors.md
diff --git a/docs/source/about/mission.md b/docs/source/about/mission.md
new file mode 100644
index 00000000000..5120dd4430c
--- /dev/null
+++ b/docs/source/about/mission.md
@@ -0,0 +1,7 @@
+# Mission
+
+sktime enables understandable and composable machine learning with time
+series. It provides [scikit-learn] compatible algorithms and model composition tools, supported by a clear
+taxonomy of learning tasks, with instructive documentation and a friendly community.
+
+[scikit-learn]: https://scikit-learn.org/stable/
diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst
index aa95926c975..38fc7f4e81e 100644
--- a/docs/source/api_reference.rst
+++ b/docs/source/api_reference.rst
@@ -4,20 +4,25 @@
 API Reference
 =============
 
-This is the class and function reference for ``sktime``.
+Welcome to the API reference for ``sktime``.
 
-.. autosummary::
-    :toctree: modules/auto_generated/
+The API reference provides a technical manual.
+It describes the classes and functions included in sktime.
+For a scientific manual, see the :ref:`user_guide`.
 
 .. include:: includes/api_css.rst
 
-.. include:: api_reference/classification.rst
-.. include:: api_reference/regression.rst
-.. include:: api_reference/series_as_features.rst
-.. include:: api_reference/forecasting.rst
-.. include:: api_reference/annotation.rst
-.. include:: api_reference/transformations.rst
-.. include:: api_reference/performance_metrics.rst
-.. include:: api_reference/datasets.rst
-.. include:: api_reference/utils.rst
-.. include:: api_reference/exceptions.rst
+.. toctree::
+    :maxdepth: 1
+
+    api_reference/base
+    api_reference/forecasting
+    api_reference/annotation
+    api_reference/classification
+    api_reference/regression
+    api_reference/series_as_features
+    api_reference/transformations
+    api_reference/performance_metrics
+    api_reference/datasets
+    api_reference/utils
+    api_reference/exceptions
diff --git a/docs/source/api_reference/annotation.rst b/docs/source/api_reference/annotation.rst
index 7a6a829fd4d..6c28f5f2309 100644
--- a/docs/source/api_reference/annotation.rst
+++ b/docs/source/api_reference/annotation.rst
@@ -1,13 +1,15 @@
 .. _annotation_ref:
 
-sktime.annotation: Time series annotation
-=========================================
+Time series annotation
+======================
 
 The :mod:`sktime.annotation` module contains algorithms and composition tools
 for time series annotation (for example, anomaly or outlier detection).
 
 .. automodule:: sktime.annotation
-    :no-members:
+   :no-members:
+   :no-inherited-members:
+
 
 Adapters
 --------
@@ -15,7 +17,7 @@ Adapters
 .. currentmodule:: sktime.annotation.adapters
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     PyODAnnotator
diff --git a/docs/source/api_reference/base.rst b/docs/source/api_reference/base.rst
new file mode 100644
index 00000000000..c8f24a0a307
--- /dev/null
+++ b/docs/source/api_reference/base.rst
@@ -0,0 +1,22 @@
+.. _base_ref:
+
+Base
+====
+
+The :mod:`sktime.base` module contains abstract base classes.
+
+.. automodule:: sktime.base
+    :no-members:
+    :no-inherited-members:
+
+Base classes
+------------
+
+.. currentmodule:: sktime.base
+
+.. autosummary::
+    :toctree: auto_generated/
+    :template: class.rst
+
+    BaseObject
+    BaseEstimator
diff --git a/docs/source/api_reference/classification.rst b/docs/source/api_reference/classification.rst
index ddbf2d9a58c..590cf86dade 100644
--- a/docs/source/api_reference/classification.rst
+++ b/docs/source/api_reference/classification.rst
@@ -1,12 +1,14 @@
 .. _classification_ref:
 
-sktime.classification: Time series classification
-=================================================
+Time series classification
+==========================
 
 The :mod:`sktime.classification` module contains algorithms and composition tools for time series classification.
 
 .. automodule:: sktime.classification
     :no-members:
+    :no-inherited-members:
+
 
 Composition
 -----------
@@ -14,7 +16,7 @@ Composition
 .. currentmodule:: sktime.classification.compose
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ColumnEnsembleClassifier
@@ -25,7 +27,7 @@ Dictionary-based
 .. currentmodule:: sktime.classification.dictionary_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     IndividualBOSS
@@ -42,7 +44,7 @@ Distance-based
 .. currentmodule:: sktime.classification.distance_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     KNeighborsTimeSeriesClassifier
@@ -57,7 +59,7 @@ Hybrid
 .. currentmodule:: sktime.classification.hybrid
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     HIVECOTEV1
@@ -68,7 +70,7 @@ Interval-based
 .. currentmodule:: sktime.classification.interval_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     TimeSeriesForestClassifier
@@ -83,19 +85,19 @@ Shapelet-based
 .. currentmodule:: sktime.classification.shapelet_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ShapeletTransformClassifier
     MrSEQLClassifier
 
 Kernel-based
---------------
+------------
 
 .. currentmodule:: sktime.classification.kernel_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ROCKETClassifier
@@ -107,7 +109,7 @@ Feature-based
 .. currentmodule:: sktime.classification.feature_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     Catch22Classifier
diff --git a/docs/source/api_reference/datasets.rst b/docs/source/api_reference/datasets.rst
index 1ce0b562255..0263e40c0bd 100644
--- a/docs/source/api_reference/datasets.rst
+++ b/docs/source/api_reference/datasets.rst
@@ -1,12 +1,16 @@
 .. _datasets_ref:
 
-sktime.datasets: Datasets
-=========================
+Datasets
+========
 
-.. currentmodule:: sktime.datasets.base
+.. automodule:: sktime.datasets
+    :no-members:
+    :no-inherited-members:
+
+.. currentmodule:: sktime.datasets
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: function.rst
 
     load_airline
diff --git a/docs/source/api_reference/exceptions.rst b/docs/source/api_reference/exceptions.rst
index 2ccb53b3d8f..2d18760405e 100644
--- a/docs/source/api_reference/exceptions.rst
+++ b/docs/source/api_reference/exceptions.rst
@@ -1,14 +1,18 @@
 .. _exceptions_ref:
 
-sktime.exceptions: Exceptions
-=============================
+Exceptions
+==========
 
 The :mod:`sktime.exceptions` module contains classes for exceptions and warnings.
 
+.. automodule:: sktime.exceptions
+    :no-members:
+    :no-inherited-members:
+
 .. currentmodule:: sktime.exceptions
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     NotEvaluatedError
diff --git a/docs/source/api_reference/forecasting.rst b/docs/source/api_reference/forecasting.rst
index d970284b95d..bb20f0b18d7 100644
--- a/docs/source/api_reference/forecasting.rst
+++ b/docs/source/api_reference/forecasting.rst
@@ -1,13 +1,15 @@
 
 .. _forecasting_ref:
 
-sktime.forecasting: Time series forecasting
-===========================================
+Forecasting
+===========
 
 The :mod:`sktime.forecasting` module contains algorithms and composition tools for forecasting.
 
 .. automodule:: sktime.forecasting
     :no-members:
+    :no-inherited-members:
+
 
 Base
 ----
@@ -15,7 +17,7 @@ Base
 .. currentmodule:: sktime.forecasting.base
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ForecastingHorizon
@@ -26,7 +28,7 @@ Naive
 .. currentmodule:: sktime.forecasting.naive
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     NaiveForecaster
@@ -37,7 +39,7 @@ Trend
 .. currentmodule:: sktime.forecasting.trend
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     PolynomialTrendForecaster
@@ -48,7 +50,7 @@ Exponential Smoothing
 .. currentmodule:: sktime.forecasting.exp_smoothing
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ExponentialSmoothing
@@ -56,7 +58,7 @@ Exponential Smoothing
 .. currentmodule:: sktime.forecasting.ets
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     AutoETS
@@ -67,7 +69,7 @@ ARIMA
 .. currentmodule:: sktime.forecasting.arima
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     AutoARIMA
@@ -79,7 +81,7 @@ Theta
 .. currentmodule:: sktime.forecasting.theta
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ThetaForecaster
@@ -90,7 +92,7 @@ BATS/TBATS
 .. currentmodule:: sktime.forecasting.bats
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     BATS
@@ -98,18 +100,29 @@ BATS/TBATS
 .. currentmodule:: sktime.forecasting.tbats
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     TBATS
 
+Croston
+-------
+
+.. currentmodule:: sktime.forecasting.croston
+
+.. autosummary::
+    :toctree: auto_generated/
+    :template: class.rst
+
+    Croston
+
 Prophet
 -------
 
 .. currentmodule:: sktime.forecasting.fbprophet
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     Prophet
@@ -120,7 +133,7 @@ Composition
 .. currentmodule:: sktime.forecasting.compose
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ColumnEnsembleForecaster
@@ -139,7 +152,7 @@ Composition
     MultiplexForecaster
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: function.rst
 
     make_reduction
@@ -150,7 +163,7 @@ Online Forecasting
 .. currentmodule:: sktime.forecasting.online_learning
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     OnlineEnsembleForecaster
@@ -163,7 +176,7 @@ Model Selection
 .. currentmodule:: sktime.forecasting.model_selection
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     CutoffSplitter
@@ -174,18 +187,18 @@ Model Selection
     ForecastingRandomizedSearchCV
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: function.rst
 
     temporal_train_test_split
 
 Model Evaluation (Backtesting)
-----------------
+------------------------------
 
 .. currentmodule:: sktime.forecasting.model_evaluation
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: function.rst
 
     evaluate
diff --git a/docs/source/api_reference/performance_metrics.rst b/docs/source/api_reference/performance_metrics.rst
index 763755740bf..3d6858bb0b6 100644
--- a/docs/source/api_reference/performance_metrics.rst
+++ b/docs/source/api_reference/performance_metrics.rst
@@ -1,23 +1,25 @@
 
 .. _performance_metric_ref:
 
-sktime.performance_metrics: Measuring time series model performance
-=======================================================
+Performance metrics
+===================
 
 The :mod:`sktime.performance_metrics` module contains metrics for evaluating and tuning time series models.
 
 .. automodule:: sktime.performance_metrics
     :no-members:
+    :no-inherited-members:
 
 Forecasting
 -----------
 
 .. currentmodule:: sktime.performance_metrics.forecasting
 
-Tunable Classes
-***************
+Classes
+~~~~~~~
+
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class_with_call.rst
 
     MeanAbsoluteScaledError
@@ -40,9 +42,10 @@ Tunable Classes
     RelativeLoss
 
 Functions
-*********
+~~~~~~~~~
+
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: function.rst
 
     make_forecasting_scorer
diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst
index 197e20f05aa..de1d8e51e88 100644
--- a/docs/source/api_reference/regression.rst
+++ b/docs/source/api_reference/regression.rst
@@ -1,12 +1,13 @@
 .. _regression_ref:
 
-sktime.regression: Time series regression
-=========================================
+Time series regression
+======================
 
 The :mod:`sktime.regression` module contains algorithms and composition tools for time series regression.
 
 .. automodule:: sktime.regression
     :no-members:
+    :no-inherited-members:
 
 Composition
 -----------
@@ -14,7 +15,7 @@ Composition
 .. currentmodule:: sktime.regression.compose
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ComposableTimeSeriesForestRegressor
@@ -25,7 +26,7 @@ Interval-based
 .. currentmodule:: sktime.regression.interval_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     TimeSeriesForestRegressor
diff --git a/docs/source/api_reference/series_as_features.rst b/docs/source/api_reference/series_as_features.rst
index ab5d1d9bffb..09cec5aefd1 100644
--- a/docs/source/api_reference/series_as_features.rst
+++ b/docs/source/api_reference/series_as_features.rst
@@ -1,13 +1,14 @@
 
 .. _series_as_features_ref:
 
-sktime.series_as_features: Series-as-features tools
-===================================================
+Series-as-features tools
+========================
 
 The :mod:`sktime.series_as_features` module contains algorithms and composition tools that are shared by the classification and regression modules.
 
 .. automodule:: sktime.series_as_features
     :no-members:
+    :no-inherited-members:
 
 Composition
 -----------
@@ -15,7 +16,7 @@ Composition
 .. currentmodule:: sktime.series_as_features.compose
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     FeatureUnion
@@ -26,7 +27,7 @@ Model selection
 .. currentmodule:: sktime.series_as_features.model_selection
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     PresplitFilesCV
diff --git a/docs/source/api_reference/transformations.rst b/docs/source/api_reference/transformations.rst
index 622ca22880c..36b13acbe94 100644
--- a/docs/source/api_reference/transformations.rst
+++ b/docs/source/api_reference/transformations.rst
@@ -1,14 +1,14 @@
 .. _transformations_ref:
 
-sktime.transformations: Time series transformers
-=============================================
+Time series transformations
+===========================
 
 The :mod:`sktime.transformations` module contains classes for data
 transformations.
 
 .. automodule:: sktime.transformations
-   :members:
-   :inherited-members:
+   :no-members:
+   :no-inherited-members:
 
 Panel transformers
 ------------------
@@ -19,7 +19,7 @@ Dictionary-based
 .. currentmodule:: sktime.transformations.panel.dictionary_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     PAA
@@ -32,7 +32,7 @@ Summarize
 .. currentmodule:: sktime.transformations.panel.summarize
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     DerivativeSlopeTransformer
@@ -46,7 +46,7 @@ tsfresh
 .. currentmodule:: sktime.transformations.panel.tsfresh
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     TSFreshRelevantFeatureExtractor
@@ -58,7 +58,7 @@ Catch22
 .. currentmodule:: sktime.transformations.panel.catch22
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     Catch22
@@ -69,7 +69,7 @@ Compose
 .. currentmodule:: sktime.transformations.panel.compose
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ColumnTransformer
@@ -78,7 +78,7 @@ Compose
     SeriesToPrimitivesRowTransformer
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: function.rst
 
     make_row_transformer
@@ -89,7 +89,7 @@ Matrix profile
 .. currentmodule:: sktime.transformations.panel.matrix_profile
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     MatrixProfile
@@ -100,7 +100,7 @@ PCA
 .. currentmodule:: sktime.transformations.panel.pca
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     PCATransformer
@@ -111,7 +111,7 @@ Reduce
 .. currentmodule:: sktime.transformations.panel.reduce
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     Tabularizer
@@ -122,7 +122,7 @@ Rocket
 .. currentmodule:: sktime.transformations.panel.rocket
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     Rocket
@@ -135,7 +135,7 @@ Segment
 .. currentmodule:: sktime.transformations.panel.segment
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     IntervalSegmenter
@@ -147,7 +147,7 @@ Shapelet
 .. currentmodule:: sktime.transformations.panel.shapelets
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ShapeletTransform
@@ -159,7 +159,7 @@ Signature
 .. currentmodule:: sktime.transformations.panel.signature_based
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     SignatureTransformer
@@ -173,7 +173,7 @@ Detrend
 .. currentmodule:: sktime.transformations.series.detrend
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     Detrender
@@ -186,7 +186,7 @@ Adapt
 .. currentmodule:: sktime.transformations.series.adapt
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     TabularToSeriesAdaptor
@@ -197,7 +197,7 @@ Box-cox
 .. currentmodule:: sktime.transformations.series.boxcox
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     BoxCoxTransformer
@@ -209,7 +209,7 @@ Auto-correlation
 .. currentmodule:: sktime.transformations.series.acf
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     AutoCorrelationTransformer
@@ -221,7 +221,7 @@ Cosine
 .. currentmodule:: sktime.transformations.series.cos
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     CosineTransformer
@@ -232,7 +232,7 @@ Exponent
 .. currentmodule:: sktime.transformations.series.exponent
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ExponentTransformer
@@ -244,40 +244,40 @@ Matrix Profile
 .. currentmodule:: sktime.transformations.series.matrix_profile
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     MatrixProfileTransformer
 
 Missing value imputation
-~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. currentmodule:: sktime.transformations.series.impute
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     Imputer
 
 Outlier detection
-~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~
 
 .. currentmodule:: sktime.transformations.series.outlier_detection
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     HampelFilter
 
 Composition
-~~~~~~~~~~~~~~
+~~~~~~~~~~~
 
 .. currentmodule:: sktime.transformations.series.compose
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     OptionalPassthrough
@@ -289,7 +289,7 @@ Theta
 .. currentmodule:: sktime.transformations.series.theta
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: class.rst
 
     ThetaLinesTransformer
diff --git a/docs/source/api_reference/utils.rst b/docs/source/api_reference/utils.rst
index ee1e29972a0..0bce14c70c7 100644
--- a/docs/source/api_reference/utils.rst
+++ b/docs/source/api_reference/utils.rst
@@ -1,17 +1,21 @@
 .. _utils_ref:
 
-sktime.utils: Utility function
-==============================
+Utility functions
+=================
 
 The :mod:`sktime.utils` module contains utility functions.
 
+.. automodule:: sktime.utils
+    :no-members:
+    :no-inherited-members:
+
 Plotting
 --------
 
 .. currentmodule:: sktime.utils.plotting
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: function.rst
 
     plot_series
@@ -23,7 +27,7 @@ Data Processing
 .. currentmodule:: sktime.datatypes._panel._convert
 
 .. autosummary::
-    :toctree: modules/auto_generated/
+    :toctree: auto_generated/
     :template: function.rst
 
     are_columns_nested
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 176595d14cd..d501a8a8683 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -25,7 +25,7 @@
 
 # -- Project information -----------------------------------------------------
 project = "sktime"
-copyright = "2019 - 2020 (BSD-3-Clause License)"
+copyright = "2019 - 2021 (BSD-3-Clause License)"
 author = "sktime developers"
 
 # The full version, including alpha/beta/rc tags
@@ -46,18 +46,19 @@
 extensions = [
     "sphinx.ext.autodoc",
     "sphinx.ext.autosummary",
+    "numpydoc",
     "sphinx.ext.intersphinx",
-    "sphinx.ext.autosectionlabel",
-    "sphinx.ext.todo",
-    "sphinx.ext.mathjax",
-    # 'sphinx.ext.viewcode',  # link to auto-generated source code files (rst)
-    "sphinx.ext.githubpages",
     "sphinx.ext.linkcode",  # link to GitHub source code via linkcode_resolve()
-    "sphinx.ext.napoleon",
     "nbsphinx",  # integrates example notebooks
-    "m2r2",  # markdown rendering
+    "sphinx_gallery.load_style",
+    "myst_parser",
+    "sphinx_panels",
+    "sphinx_issues",
 ]
 
+# Use bootstrap CSS from theme.
+panels_add_bootstrap_css = False
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
 
@@ -81,22 +82,53 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ["_build", ".ipynb_checkpoints", "Thumbs.db", ".DS_Store"]
+exclude_patterns = [
+    "_build",
+    ".ipynb_checkpoints",
+    "Thumbs.db",
+    ".DS_Store",
+]
+
+add_module_names = False
 
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = "sphinx"
 
 # see http://stackoverflow.com/q/12206334/562769
 numpydoc_show_class_members = True
+# this is needed for some reason...
+# see https://github.com/numpy/numpydoc/issues/69
 numpydoc_class_members_toctree = False
 
+numpydoc_validation_checks = {"all"}
+
 # generate autosummary even if no references
 autosummary_generate = True
-autodoc_default_flags = ["members", "inherited-members"]
+
+# Members and inherited-members default to showing methods and attributes from a
+# class or those inherited.
+# Member-order orders the documentation in the order of how the members are defined in
+# the source code.
+autodoc_default_options = {
+    "members": True,
+    "inherited-members": True,
+    "member-order": "bysource",
+}
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+add_function_parentheses = False
+
+# When building HTML using the sphinx.ext.mathjax (enabled by default),
+# Myst-Parser injects the tex2jax_ignore (MathJax v2) and mathjax_ignore (MathJax v3)
+# classes in to the top-level section of each MyST document, and adds some default
+# configuration. This ensures that MathJax processes only math, identified by the
+# dollarmath and amsmath extensions, or specified in math directives. We here silence
+# the corresponding warning that this override happens.
+suppress_warnings = ["myst.mathjax"]
 
 
 def linkcode_resolve(domain, info):
-    """Return URL to source code correponding.
+    """Return URL to source code corresponding.
 
     Parameters
     ----------
@@ -139,18 +171,54 @@ def find_source():
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 
-html_theme = "sphinx_rtd_theme"
-# html_theme = 'bootstrap'
+html_theme = "pydata_sphinx_theme"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
 
 html_theme_options = {
-    "prev_next_buttons_location": None,
+    "icon_links": [
+        {
+            "name": "GitHub",
+            "url": "https://github.com/alan-turing-institute/sktime",
+            "icon": "fab fa-github",
+        },
+        {
+            "name": "Twitter",
+            "url": "https://twitter.com/sktime_toolbox",
+            "icon": "fab fa-twitter",
+        },
+        {
+            "name": "Discord",
+            "url": "https://discord.com/invite/gqSab2K",
+            "icon": "fab fa-discord",
+        },
+    ],
+    "favicons": [
+        {
+            "rel": "icon",
+            "sizes": "16x16",
+            "href": "images/sktime-favicon.ico",
+        }
+    ],
+    "show_prev_next": False,
+    "use_edit_page_button": False,
+    "navbar_start": ["navbar-logo"],
+    "navbar_center": ["navbar-nav"],
+    "navbar_end": ["navbar-icon-links"],
+}
+html_logo = "images/sktime-logo-text-horizontal.png"
+html_context = {
+    "github_user": "alan-turing-institute",
+    "github_repo": "sktime",
+    "github_version": "main",
+    "doc_path": "docs/source/",
 }
-
 html_favicon = "images/sktime-favicon.ico"
+html_sidebars = {
+    "**": ["search-field.html", "sidebar-nav-bs.html", "sidebar-ethical-ads.html"]
+}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
@@ -322,32 +390,40 @@ def adds(pth):
 nbsphinx_timeout = 600  # seconds, set to -1 to disable timeout
 
 # add Binder launch buttom at the top
-CURRENT_FILE = "{{ env.doc2path( env.docname, base=None) }}"
+current_file = "{{ env.doc2path( env.docname, base=None) }}"
 
 # make sure Binder points to latest stable release, not main
-BINDER_URL = f"https://mybinder.org/v2/gh/alan-turing-institute/sktime/{CURRENT_VERSION}?filepath={CURRENT_FILE}"  # noqa
+binder_url = f"https://mybinder.org/v2/gh/alan-turing-institute/sktime/{CURRENT_VERSION}?filepath={current_file}"  # noqa
 nbsphinx_prolog = f"""
 .. |binder| image:: https://mybinder.org/badge_logo.svg
-.. _Binder: {BINDER_URL}
+.. _Binder: {binder_url}
 
 |Binder|_
 """
 
 # add link to original notebook at the bottom
-NOTEBOOK_URL = f"https://github.com/alan-turing-institute/sktime/tree/{CURRENT_VERSION}/{CURRENT_FILE}"  # noqa
+notebook_url = f"https://github.com/alan-turing-institute/sktime/tree/{CURRENT_VERSION}/{current_file}"  # noqa
 nbsphinx_epilog = f"""
 ----
 
-Generated by nbsphinx_. The Jupyter notebook can be found here_.
+Generated using nbsphinx_. The Jupyter notebook can be found here_.
 
-.. _here: {NOTEBOOK_URL}
+.. _here: {notebook_url}
 .. _nbsphinx: https://nbsphinx.readthedocs.io/
 """
 
 # -- Options for intersphinx extension ---------------------------------------
 
 # Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {"https://docs.python.org/": None}
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/{.major}".format(sys.version_info), None),
+    "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
+    "matplotlib": ("https://matplotlib.org/", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+    "joblib": ("https://joblib.readthedocs.io/en/latest/", None),
+    "scikit-learn": ("https://scikit-learn.org/stable/", None),
+}
 
 # -- Options for _todo extension ----------------------------------------------
 todo_include_todos = False
diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst
deleted file mode 100644
index 3fb74dd66b5..00000000000
--- a/docs/source/contributing.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-.. _contributing:
-
-.. mdinclude:: ../../CONTRIBUTING.md
diff --git a/docs/source/contributors.rst b/docs/source/contributors.rst
deleted file mode 100644
index 9c8e0e8a1e8..00000000000
--- a/docs/source/contributors.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-.. _contributors:
-
-.. mdinclude:: ../../CONTRIBUTORS.md
diff --git a/docs/source/developer_guide.rst b/docs/source/developer_guide.rst
index 47be4aacd49..361fdf418f8 100644
--- a/docs/source/developer_guide.rst
+++ b/docs/source/developer_guide.rst
@@ -17,8 +17,9 @@ Welcome to sktime's developer guide!
 
 .. toctree::
    :maxdepth: 1
+   :glob:
 
    developer_guide/introduction
-   developer_guide/forecasting
-   developer_guide/classification
    developer_guide/add_dataset
+   developer_guide/add_estimators
+   developer_guide/documentation
diff --git a/docs/source/developer_guide/add_dataset.rst b/docs/source/developer_guide/add_dataset.rst
index 5582da7f6e9..6fc0844604f 100644
--- a/docs/source/developer_guide/add_dataset.rst
+++ b/docs/source/developer_guide/add_dataset.rst
@@ -1,13 +1,13 @@
-.. _developer_guide_forecasting:
+.. _developer_guide_add_dataset:
+
+====================
+Adding a New Dataset
+====================
 
-Adding Datasets to sktime
-=========================
 Follow these steps to add a new dataset to sktime:
 
-*  Include CSV file or supported other format under :code:`sktime/datasets/data/<dataset-name>`
+*  Include CSV file or other supported format under :code:`sktime/datasets/data/<dataset-name>`
 *  Add :code:`load_<dataset-name>(...)` function in file :code:`sktime/datasets/base.py`
-*  Add :code:`<dataset-name>` to the list :code:`__all__ = [...` in file :code:`sktime/datasets/__init__.py`
+*  Add :code:`<dataset-name>` to the list :code:`__all__ = [...]` in file :code:`sktime/datasets/__init__.py`
 *  Add :code:`<dataset-name>` as argument to method :code:`included_datasets = (...` in file :code:`sktime/sktime/datasets/setup.py`
 *  Add :code:`<dataset-name>` to the list of included problems in file :code:`sktime/sktime/datasets/setup.py`
-
-Thank you for your contribution!
diff --git a/docs/source/developer_guide/add_estimators.rst b/docs/source/developer_guide/add_estimators.rst
new file mode 100644
index 00000000000..04d3ef9bb89
--- /dev/null
+++ b/docs/source/developer_guide/add_estimators.rst
@@ -0,0 +1,14 @@
+.. _developer_guide_add_estimators:
+
+======================
+Adding a New Estimator
+======================
+
+Please use the extension templates below to implement new estimators. In addition to following the templates,
+please ensure that code also meets ``sktime's`` :ref:`documentation <developer_guide_documentation>` standards.
+
+Forecasting
+===========
+
+.. literalinclude:: ../../../extension_templates/forecasting.py
+    :language: python
diff --git a/docs/source/developer_guide/classification.rst b/docs/source/developer_guide/classification.rst
deleted file mode 100644
index dd03b1ea716..00000000000
--- a/docs/source/developer_guide/classification.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-.. _developer_guide_classification:
-
-Time Series Classification
-==========================
-
-.. note::
-
-    The developer guide is under development. We have created a basic
-    structure and are looking for contributions to develop the guide
-    further. For more details, please go to issue `#464 <https://github
-    .com/alan-turing-institute/sktime/issues/464>`_ on GitHub.
diff --git a/docs/source/developer_guide/documentation.rst b/docs/source/developer_guide/documentation.rst
new file mode 100644
index 00000000000..953097c6a00
--- /dev/null
+++ b/docs/source/developer_guide/documentation.rst
@@ -0,0 +1,19 @@
+.. _developer_guide_documentation:
+
+=============
+Documentation
+=============
+
+Providing instructive documentation is a key part of ``sktime's`` mission. In order to meet this,
+developers are expected to follow ``sktime's`` documentation standards.
+
+These include:
+
+* Documenting code using NumPy docstrings
+* Following ``sktime's`` docstring convention for public code artifacts and modules
+* Adding new public functionality to the :ref:`api_reference` and :ref:`user guide <user_guide>`
+
+More detailed information on ``sktime's`` documentation format is provided below.
+
+Docstring Conventions
+=====================
diff --git a/docs/source/developer_guide/forecasting.rst b/docs/source/developer_guide/forecasting.rst
deleted file mode 100644
index 33049f8e42b..00000000000
--- a/docs/source/developer_guide/forecasting.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-.. _developer_guide_forecasting:
-
-Forecasting
-===========
-
-.. note::
-
-    The developer guide is under development. We have created a basic
-    structure and are looking for contributions to develop the guide
-    further. For more details, please go to issue `#464 <https://github
-    .com/alan-turing-institute/sktime/issues/464>`_ on GitHub.
diff --git a/docs/source/developers.rst b/docs/source/developers.rst
new file mode 100644
index 00000000000..30d2ab57e67
--- /dev/null
+++ b/docs/source/developers.rst
@@ -0,0 +1,106 @@
+.. _developers:
+
+Development
+===========
+
+.. note::
+
+   If you are new to sktime, it may be helpful to take a look at the
+   :ref:`get_involved` page first.
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   developer_guide
+   reviewer_guide
+   enhancement_proposals
+   roadmap
+   code_of_conduct
+   governance
+
+.. panels::
+    :card: + intro-card text-center
+
+    ---
+
+    Developer Guide
+    ^^^^^^^^^^^^^^^
+
+    Learn our development conventions.
+
+    +++
+
+    .. link-button:: developer_guide
+            :type: ref
+            :text: Developer Guide
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Reviewer Guide
+    ^^^^^^^^^^^^^^
+
+    How we review contributions.
+
+    +++
+
+    .. link-button:: reviewer_guide
+            :type: ref
+            :text: Reviewer Guide
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Enhancement Proposals
+    ^^^^^^^^^^^^^^^^^^^^^
+
+    Thought of a project enhancement? See when and how to submit a proposal.
+
+    +++
+
+    .. link-button:: enhancement_proposals
+            :type: ref
+            :text: Enhancement Proposals
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Roadmap
+    ^^^^^^^
+
+    What's on the development horizon?
+
+    +++
+
+    .. link-button:: roadmap
+            :type: ref
+            :text: Roadmap
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Code of Conduct
+    ^^^^^^^^^^^^^^^
+
+
+    +++
+
+    .. link-button:: code_of_conduct
+            :type: ref
+            :text: Code of Conduct
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Governance
+    ^^^^^^^^^^
+
+    How ``sktime`` is run.
+
+    +++
+
+    .. link-button:: governance
+            :type: ref
+            :text: Governance
+            :classes: btn-block btn-secondary stretched-link
diff --git a/docs/source/enhancement_proposals.rst b/docs/source/enhancement_proposals.rst
new file mode 100644
index 00000000000..a2052b319af
--- /dev/null
+++ b/docs/source/enhancement_proposals.rst
@@ -0,0 +1,6 @@
+.. _enhancement_proposals:
+
+Enhancement Proposals
+=====================
+
+Please visit our GitHub repository for `sktime enhancement proposals <https://github.com/sktime/enhancement-proposals>`_.
diff --git a/docs/source/estimator_overview.md b/docs/source/estimator_overview.md
new file mode 100644
index 00000000000..0dd4f557a4b
--- /dev/null
+++ b/docs/source/estimator_overview.md
@@ -0,0 +1,11 @@
+# Estimator Overview
+
+The table below gives an overview of all estimators in sktime.
+
+<p>
+<label for="myInput"></label><input type="text" id="myInput" placeholder="Search the table ..." />
+<br>
+</p>
+
+```{include} estimator_overview_table.md
+```
diff --git a/docs/source/estimator_overview.rst b/docs/source/estimator_overview.rst
deleted file mode 100644
index 794c8f87c44..00000000000
--- a/docs/source/estimator_overview.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-.. _estimator_overview:
-
-==================
-Estimator Overview
-==================
-
-The table below gives an overview of all estimators in sktime.
-
-.. raw:: html
-
-    <label for="myInput"></label><input type="text" id="myInput" placeholder="Search for names.." />
-    <br>
-
-.. mdinclude:: estimator_overview_table.md
diff --git a/docs/source/get_involved.rst b/docs/source/get_involved.rst
new file mode 100644
index 00000000000..09796dc87d5
--- /dev/null
+++ b/docs/source/get_involved.rst
@@ -0,0 +1,73 @@
+.. _get_involved:
+
+Get Involved
+============
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   get_involved/contributing
+   get_involved/mentoring
+   get_involved/meetups
+
+sktime is a community-driven project and your help is extremely welcome.
+If you get stuck, please don’t hesitate to chat with us or raise an issue.
+
+.. panels::
+    :card: + intro-card text-center
+
+    ---
+
+    Contributing
+    ^^^^^^^^^^^^
+
+    New to sktime? Check out the contributing guide.
+
+    +++
+
+    .. link-button:: contributing
+            :type: ref
+            :text: Contributing guide
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Mentoring
+    ^^^^^^^^^
+
+    New to open source? Apply to our mentoring program!
+
+    +++
+
+    .. link-button:: mentoring
+            :type: ref
+            :text: Mentoring
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Sponsoring
+    ^^^^^^^^^^
+
+    Fund sktime maintenance and development.
+
+    +++
+
+    .. link-button:: https://opencollective.com/sktime
+            :text: Donate
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Meet-ups
+    ^^^^^^^^
+
+    Join our discussions, tutorials, workshops and sprints!
+
+    +++
+
+    .. link-button:: meetups
+            :type: ref
+            :text: Participate
+            :classes: btn-block btn-secondary stretched-link
diff --git a/docs/source/get_involved/contributing.md b/docs/source/get_involved/contributing.md
new file mode 100644
index 00000000000..004f419c741
--- /dev/null
+++ b/docs/source/get_involved/contributing.md
@@ -0,0 +1,2 @@
+```{include} ../../../CONTRIBUTING.md
+```
diff --git a/docs/source/get_involved/meetups.rst b/docs/source/get_involved/meetups.rst
new file mode 100644
index 00000000000..dd6dfb11bc5
--- /dev/null
+++ b/docs/source/get_involved/meetups.rst
@@ -0,0 +1,13 @@
+.. _meetups:
+
+Community meetups
+=================
+
+Join our discussions, tutorials, workshops and sprints!
+
+Most of our meetups take place on Discord. You can join sktime's community
+server `here <https://discord.com/invite/gqSab2K>`_.
+
+.. raw:: html
+
+    <iframe src="https://calendar.google.com/calendar/embed?height=600&wkst=2&bgcolor=%23ffffff&ctz=UTC&src=c2t0aW1lLnRvb2xib3hAZ21haWwuY29t&color=%23039BE5&showCalendars=0&showTabs=1&showPrint=0&showDate=1&showNav=1&showTitle=1&title=sktime%20community%20calendar&mode=AGENDA" style="border-width:0" width="600" height="600" frameborder="0" scrolling="no"></iframe>
diff --git a/docs/source/mentoring.rst b/docs/source/get_involved/mentoring.rst
similarity index 100%
rename from docs/source/mentoring.rst
rename to docs/source/get_involved/mentoring.rst
diff --git a/docs/source/get_started.rst b/docs/source/get_started.rst
new file mode 100644
index 00000000000..7fc0934f2e6
--- /dev/null
+++ b/docs/source/get_started.rst
@@ -0,0 +1,181 @@
+.. _get_started:
+
+===========
+Get Started
+===========
+
+The following information is designed to get users up and running with ``sktime`` quickly. For more detailed information, see the links in each of the subsections.
+
+Installation
+------------
+
+``sktime`` currently supports:
+
+* environments with python version 3.6, 3.7, or 3.8.
+* operating systems Mac OS X, Unix-like OS, Windows 8.1 and higher
+* installation via ``PyPi`` or ``conda``
+
+To install ``sktime`` with its core dependencies via ``pip`` use:
+
+.. code-block:: bash
+
+    pip install sktime
+
+To install ``sktime`` via ``pip`` with maximum dependencies, including soft dependencies, install using the ``all_extras`` modifier:
+
+.. code-block:: bash
+
+    pip install sktime[all_extras]
+
+
+To install ``sktime`` with its core dependencies via ``conda`` from ``conda-forge`` use:
+
+.. code-block:: bash
+
+    conda install -c conda-forge sktime
+
+To install ``sktime`` via ``conda`` with maximum dependencies, including soft dependencies, install using the ``all-extras`` conda recipe:
+
+.. code-block:: bash
+
+    conda install -c conda-forge sktime-all-extras
+
+Key Concepts
+------------
+
+``sktime`` seeks to provide a unified framework for multiple time series machine learning tasks. This (hopefully) makes ``sktime's`` functionality intuitive for users
+and lets developers extend the framework more easily. But time series data and the related scientific use cases each can take multiple forms.
+Therefore, a key set of common concepts and terminology is important.
+
+Data Types
+~~~~~~~~~~
+
+``sktime`` is designed for time series machine learning. Time series data refers to data where the variables are ordered over time or
+an index indicating the position of an observation in the sequence of values.
+
+In ``sktime`` time series data can refer to data that is univariate, multivariate or panel, with the difference relating to the number and interrelation
+between time series :term:`variables <variable>`, as well as the number of :term:`instances <instance>` for which each variable is observed.
+
+- :term:`Univariate time series` data refers to data where a single :term:`variable` is tracked over time.
+- :term:`Multivariate time series` data refers to data where multiple :term:`variables <variable>` are tracked over time for the same :term:`instance`. For example, multiple quarterly economic indicators for a country or multiple sensor readings from the same machine.
+- :term:`Panel time series` data refers to data where the variables (univariate or multivariate) are tracked for multiple :term:`instances <instance>`. For example, multiple quarterly economic indicators for several countries or multiple sensor readings for multiple machines.
+
+Learning Tasks
+~~~~~~~~~~~~~~
+
+``sktime's`` functionality for each learning task is centered around providing a set of code artifacts that match a common interface to a given
+scientific purpose (i.e. :term:`scientific type` or :term:`scitype`). For example, ``sktime`` includes a common interface for "forecaster" classes designed to predict future values
+of a time series.
+
+``sktime's`` interface currently supports:
+
+- :term:`Time series classification` where the time series data for a given instance are used to predict a categorical target class.
+- :term:`Time series regression` where the time series data for a given instance are used to predict a continuous target value.
+- :term:`Time series clustering` where the goal is to discover groups consisting of instances with similar time series.
+- :term:`Forecasting` where the goal is to predict future values of the input series.
+- :term:`Time series annotation` which is focused on outlier detection, anomaly detection, change point detection and segmentation.
+
+Reduction
+~~~~~~~~~
+
+While the list above presents each learning task separately, in many cases it is possible to adapt one learning task to help solve another related learning task. For example,
+one approach to forecasting would be to use a regression model that explicitly accounts for the data's time dimension. However, another approach is to reduce the forecasting problem
+to cross-sectional regression, where the input data are tabularized and lags of the data are treated as independent features in ``scikit-learn`` style
+tabular regression algorithms. Likewise, one approach to a time series annotation task like anomaly detection is to reduce the problem to using a forecaster to predict future values and flag
+observations that are too far from these predictions as anomalies. ``sktime`` typically incorporates these types of :term:`reductions <reduction>` through the use of composable classes that
+let users adapt one learning task to solve another related one.
+
+For more information on ``sktime's`` terminology and functionality see the :ref:`glossary` and the :ref:`user guide <user_guide>`.
+
+Quickstart
+----------
+The code snippets below are designed to introduce ``sktime's`` functionality so you can start using its functionality quickly. For more detailed information see the :ref:`tutorials`,  :ref:`user_guide` and :ref:`api_reference` in ``sktime's`` :ref:`user_documentation`.
+
+Forecasting
+~~~~~~~~~~~
+
+.. code-block:: python
+
+    from sktime.datasets import load_airline
+    from sktime.forecasting.base import ForecastingHorizon
+    from sktime.forecasting.model_selection import temporal_train_test_split
+    from sktime.forecasting.theta import ThetaForecaster
+    from sktime.performance_metrics.forecasting import mean_absolute_percentage_error
+
+    y = load_airline()
+    y_train, y_test = temporal_train_test_split(y)
+    fh = ForecastingHorizon(y_test.index, is_relative=False)
+    forecaster = ThetaForecaster(sp=12)  # monthly seasonal periodicity
+    forecaster.fit(y_train)
+    y_pred = forecaster.predict(fh)
+    mean_absolute_percentage_error(y_test, y_pred)
+    >>> 0.08661467738190656
+
+Time Series Classification
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    from sktime.classification.interval_based import TimeSeriesForestClassifier
+    from sktime.datasets import load_arrow_head
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import accuracy_score
+
+    X, y = load_arrow_head(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+    classifier = TimeSeriesForestClassifier()
+    classifier.fit(X_train, y_train)
+    y_pred = classifier.predict(X_test)
+    accuracy_score(y_test, y_pred)
+    >>> 0.8679245283018868
+
+Time Series Regression
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+    The time series regression API is stable, but the inclusion of a dataset to illustrate its features is still in progress.
+
+.. code-block:: python
+
+    from sktime.regression.compose import ComposableTimeSeriesForestRegressor
+
+Time Series Clustering
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+   The time series clustering API is still experimental. Features may change
+   in future releases.
+
+.. code-block:: python
+
+    from sklearn.model_selection import train_test_split
+    from sktime.clustering import TimeSeriesKMeans
+    from sktime.clustering.evaluation._plot_clustering import plot_cluster_algorithm
+    from sktime.datasets import load_arrow_head
+
+    X, y = load_arrow_head(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+    k_means = TimeSeriesKMeans(n_clusters=5, init_algorithm="forgy", metric="dtw")
+    k_means.fit(X_train)
+    plot_cluster_algorithm(k_means, X_test, k_means.n_clusters)
+
+Time Series Annotation
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+   The time series annotation API is still experimental. Features may change
+   in future releases.
+
+.. code-block:: python
+
+    from sktime.annotation.adapters import PyODAnnotator
+    from pyod.models.iforest import IForest
+    from sktime.datasets import load_airline
+    y = load_airline()
+    pyod_model = IForest()
+    pyod_sktime_annotator = PyODAnnotator(pyod_model)
+    pyod_sktime_annotator.fit(y)
+    annotated_series = pyod_sktime_annotator.predict(y)
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
new file mode 100644
index 00000000000..e310b14bda7
--- /dev/null
+++ b/docs/source/glossary.rst
@@ -0,0 +1,80 @@
+.. _glossary:
+
+Glossary of Common Terms
+========================
+
+The glossary below defines common terms and API elements used throughout
+sktime.
+
+.. note::
+
+    The glossary is under development. Important terms are still missing.
+    Please create a pull request if you want to add one.
+
+
+.. glossary::
+    :sorted:
+
+    Scitype
+        See :term:`scientific type`.
+
+    Scientific type
+        A class or object type to denote a category of objects defined by a
+        common interface and data scientific purpose. For example, "forecaster"
+        or "classifier".
+
+    Forecasting
+        A learning task focused on predicting future values of a time series. For more details, see the :ref:`user_guide_forecasting`.
+
+    Time series
+         Data where the :term:`variable` measurements are ordered over time or an index indicating the position of an observation in the sequence of values.
+
+    Time series classification
+        A learning task focused on using the patterns across instances between the time series and a categorical target variable.
+
+    Time series regression
+        A learning task focused on using the patterns across instances between the time series and a continuous target variable.
+
+    Time series clustering
+        A learning task focused on discovering groups consisting of instances with similar time series.
+
+    Time series annotation
+        A learning task focused on labeling the timepoints of a time series. This includes the related tasks of outlier detection, anomaly detection, change point detection and segmentation.
+
+    Panel time series
+        A form of time series data where the same time series are observed for multiple observational units. The observed series may consist of :term:`univariate time series` or
+        :term:`multivariate time series`. Accordingly, the data varies across time, observational unit and series (i.e. variables).
+
+    Univariate time series
+        A single time series. While univariate analysis often only uses information contained in the series itself,
+        univariate time series regression and forecasting can also include :term:`exogenous` data.
+
+    Multivariate time series
+        Multiple time series. Typically observed for the same observational unit. Multivariate time series
+        is typically used to refer to cases where the series evolve together over time. This is related to, but different from, the cases where
+        a :term:`univariate time series` is dependent on :term:`exogenous` data.
+
+    Endogenous
+        Within a learning task endogenous variables are determined by exogenous variables or past timepoints of the variable itself. Also referred to
+        as the dependent variable or target.
+
+    Exogenous
+        Within a learning task exogenous variables are external factors whose pattern of impact on tasks' endogenous variables must be learned.
+        Also referred to as independent variables or features.
+
+    Reduction
+        Reduction refers to decomposing a given learning task into simpler tasks that can be composed to create a solution to the original task.
+        In ``sktime`` reduction is used to allow one learning task to be adapted as a solution for an alternative task.
+
+    Variable
+        Refers to some measurement of interest. Variables may be cross-sectional (e.g. time-invariant measurements like a patient's place of birth) or
+        :term:`time series`.
+
+    Timepoint
+        The point in time at which an observation is made. A time point may represent an exact point in time (a timestamp),
+        a time period (e.g. minutes, hours or days), or simply an index indicating the position of an observation in the sequence of values.
+
+    Instance
+        A member of the set of entities being studied and to which an ML practitioner wishes to generalize. For example,
+        patients, chemical process runs, machines, countries, etc. May also be referred to as samples, examples, observations or records
+        depending on the discipline and context.
diff --git a/docs/source/images/sktime-logo-text-horizontal.png b/docs/source/images/sktime-logo-text-horizontal.png
new file mode 100644
index 00000000000..070bb31a76e
Binary files /dev/null and b/docs/source/images/sktime-logo-text-horizontal.png differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7419b7b7683..e377849cba8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,7 +1,4 @@
-.. sktime documentation main file, created by
-   sphinx-quickstart on Sat Jan 26 21:22:05 2019.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
+.. _home:
 
 Welcome to sktime
 =================
@@ -10,37 +7,104 @@ Welcome to sktime
 
 sktime provides time series algorithms and scikit-learn compatible tools to build, tune and validate time series models. sktime provides a unified interface for multiple time series learning tasks, including time series classification, regression, clustering, annotation and forecasting.
 
-Navigation
-----------
 
-From here, you can navigate to:
 
 .. toctree::
-   :caption: Users
    :maxdepth: 1
+   :hidden:
 
-   installation
-   tutorials
-   user_guide
-   estimator_overview
+   get_started
+   users
    api_reference
-   changelog
-   roadmap
-   related_software
+   get_involved
+   developers
+   about
 
-.. toctree::
-   :caption: Developers
-   :maxdepth: 1
+From here, you can navigate to:
 
-   contributing
-   developer_guide
-   mentoring
-   code_of_conduct
-   governance
+.. panels::
+    :card: + intro-card text-center
 
-.. toctree::
-   :caption: About
-   :maxdepth: 1
+    ---
 
-   about
-   contributors
+    Get Started
+    ^^^^^^^^^^^
+
+    Get started using ``sktime`` quickly.
+
+    +++
+
+    .. link-button:: get_started
+            :type: ref
+            :text: Get Started
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Users
+    ^^^^^
+
+    Find user documentation.
+
+    +++
+
+    .. link-button:: users
+            :type: ref
+            :text: Users
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    API Reference
+    ^^^^^^^^^^^^^
+
+    Understand sktime's API.
+
+    +++
+
+    .. link-button:: api_reference
+            :type: ref
+            :text: API Reference
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Get Involved
+    ^^^^^^^^^^^^
+
+    Find out how you can contribute.
+
+    +++
+
+    .. link-button:: get_involved
+            :type: ref
+            :text: Get Involved
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    Developers
+    ^^^^^^^^^^
+
+    Information for developers.
+
+    +++
+
+    .. link-button:: developers
+            :type: ref
+            :text: Developers
+            :classes: btn-block btn-secondary stretched-link
+
+    ---
+
+    About
+    ^^^^^
+
+    Learn more about ``sktime``.
+
+    +++
+
+    .. link-button:: about
+            :type: ref
+            :text: Learn More
+            :classes: btn-block btn-secondary stretched-link
diff --git a/docs/source/reviewer_guide.rst b/docs/source/reviewer_guide.rst
new file mode 100644
index 00000000000..997968d05e2
--- /dev/null
+++ b/docs/source/reviewer_guide.rst
@@ -0,0 +1,38 @@
+.. _reviewer_guide:
+
+==============
+Reviewer Guide
+==============
+
+.. warning::
+
+    The reviewer guide is under development.
+
+
+Triage
+======
+
+* Assign relevant labels
+* Assign to relevant project board
+* Title: Is it using the 3-letter codes? Is it understandable?
+* Description: Is it understandable? Any related issues/PRs?
+* CI checks: approval for first-time contributors, any help needed with
+  code/doc quality checks?
+* Merge conflicts
+
+Code
+====
+
+* Unit testing: Are the code changes tested? Are the unit tests understandable?
+* Test code locally: Does everything work as expected?
+* Deprecation warnings: Has the public API changed? Have deprecation
+  warnings been added before making the changes?
+
+Documentation
+=============
+
+* Are the docstrings complete? Are they understandable?
+* Do they follow the NumPy format and sktime conventions?
+* Does the online documentation render correctly with the changes?
+* Could we add links to relevant topics in the :ref:`glossary` or
+  :ref:`user_guide`?
diff --git a/docs/source/roadmap.rst b/docs/source/roadmap.rst
index c636c0af60c..d66d2f72126 100644
--- a/docs/source/roadmap.rst
+++ b/docs/source/roadmap.rst
@@ -1,26 +1,28 @@
 .. _roadmap:
 
+=======
 Roadmap
 =======
 
+Welcome to sktime's roadmap.
+
 Contributors: @mloning, @fkiraly, @sveameyer13, @lovkush-a, @bilal-196, @GuzalBulatova, @chrisholder, @satya-pattnaik, @aiwalter
 
 Created during the 2021 sktime dev days, 25/06/2021.
 
----
-
-Welcome to sktime's roadmap.
+----
 
 Project aims
 ------------
 The aim of sktime is to:
+
 * Develop a unified framework for machine learning with time series in Python
 * Advance research on algorithm development and software design for machine learning toolboxes
 * Build a more connected community of researchers and domain experts who work with time series
 * Create and deliver educational material including documentation and user guides
 
-Work packages
--------------
+Work streams
+------------
 
 Documentation
 ~~~~~~~~~~~~~
@@ -56,8 +58,8 @@ Refactoring and extending existing modules
 * Series annotation
     * implement more estimators for outlier anomaly/detection and segmentation
 
-New modules and algorithms
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+Adding new modules and algorithms
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * Panel annotation
 * Probabilistic interface, event modelling(time-to-event modeling, survival analysis)
 * Panel & supervised forecasting
diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst
index 528cb2ddbd8..e0cf7d94e6e 100644
--- a/docs/source/tutorials.rst
+++ b/docs/source/tutorials.rst
@@ -3,40 +3,18 @@
 Tutorials
 =========
 
-.. |binder| image:: https://mybinder.org/badge_logo.svg
-.. _Binder: https://mybinder.org/v2/gh/alan-turing-institute/sktime/main?filepath=examples/
-
 We have a number of tutorial notebooks. If you like to run them interactively, you can launch them on Binder without
 having to install anything.
 
-|Binder|_
-
 We assume basic familiarity with `scikit-learn`_. If you haven’t worked with scikit-learn before, check out their
 `getting-started guide`_.
 
-.. toctree::
-    :maxdepth: 1
-
-    examples/01_forecasting.ipynb
-    examples/01a_forecasting_sklearn.ipynb
-    examples/02_classification_univariate.ipynb
-    examples/03_classification_multivariate.ipynb
-    examples/04_benchmarking.ipynb
-    examples/feature_extraction_with_tsfresh.ipynb
-    examples/shapelet_transform.ipynb
-    examples/interval_based_classification.ipynb
-    examples/catch22.ipynb
-    examples/mrseql.ipynb
-    examples/rocket.ipynb
-    examples/minirocket.ipynb
-    examples/plateau_finder.ipynb
-    examples/loading_data.ipynb
-    examples/forecasting_with_hcrystalball.ipynb
-    examples/dictionary_based_classification.ipynb
-    examples/interpolation.ipynb
-    examples/window_splitter.ipynb
+The notebook files can be found `here <https://github.com/alan-turing-institute/sktime/blob/main/examples>`_.
 
 .. _scikit-learn: https://scikit-learn.org/stable/
 .. _getting-started guide: https://scikit-learn.org/stable/getting_started.html
 
-The notebook files can be found `here <https://github.com/alan-turing-institute/sktime/blob/main/examples>`_.
+.. nbgallery::
+    :glob:
+
+    examples/*
diff --git a/docs/source/user_guide.rst b/docs/source/user_guide.rst
index 228476f03e6..3fa729c618c 100644
--- a/docs/source/user_guide.rst
+++ b/docs/source/user_guide.rst
@@ -29,4 +29,7 @@ meant to provide a technical manual.
    user_guide/classification
    user_guide/regression
    user_guide/forecasting
+   user_guide/clustering
+   user_guide/annotation
+   user_guide/performance_metrics
    user_guide/resources
diff --git a/docs/source/user_guide/annotation.rst b/docs/source/user_guide/annotation.rst
new file mode 100644
index 00000000000..2bb1d03f68b
--- /dev/null
+++ b/docs/source/user_guide/annotation.rst
@@ -0,0 +1,11 @@
+.. _user_guide_annotation:
+
+Annotation
+==========
+
+.. note::
+
+    The user guide is under development. We have created a basic
+    structure and are looking for contributions to develop the user guide
+    further. For more details, please go to issue `#361 <https://github
+    .com/alan-turing-institute/sktime/issues/361>`_ on GitHub.
diff --git a/docs/source/user_guide/clustering.rst b/docs/source/user_guide/clustering.rst
new file mode 100644
index 00000000000..14a73bf71a7
--- /dev/null
+++ b/docs/source/user_guide/clustering.rst
@@ -0,0 +1,11 @@
+.. _user_guide_clustering:
+
+Clustering
+==========
+
+.. note::
+
+    The user guide is under development. We have created a basic
+    structure and are looking for contributions to develop the user guide
+    further. For more details, please go to issue `#361 <https://github
+    .com/alan-turing-institute/sktime/issues/361>`_ on GitHub.
diff --git a/docs/source/user_guide/learning_tasks.rst b/docs/source/user_guide/learning_tasks.rst
index b4e2af41962..f33962837ee 100644
--- a/docs/source/user_guide/learning_tasks.rst
+++ b/docs/source/user_guide/learning_tasks.rst
@@ -9,15 +9,3 @@ Overview of Learning Tasks
     structure and are looking for contributions to develop the user guide
     further. For more details, please go to issue `#361 <https://github
     .com/alan-turing-institute/sktime/issues/361>`_ on GitHub.
-
-Contents
---------
-
-* :ref:`classification`
-* :ref:`regression`
-* :ref:`forecasting`
-* Clustering
-* Time series annotation
-* Panel forecasting
-* Supervised forecasting
-* Reduction
diff --git a/docs/source/user_guide/performance_metrics.rst b/docs/source/user_guide/performance_metrics.rst
new file mode 100644
index 00000000000..8cb5de28c83
--- /dev/null
+++ b/docs/source/user_guide/performance_metrics.rst
@@ -0,0 +1,15 @@
+.. _user_guide_performance_metrics:
+
+Performance Metrics
+===================
+
+.. note::
+
+    The user guide is under development. We have created a basic
+    structure and are looking for contributions to develop the user guide
+    further. For more details, please go to issue `#361 <https://github
+    .com/alan-turing-institute/sktime/issues/361>`_ on GitHub.
+
+
+Forecasting
+-----------
diff --git a/docs/source/users.rst b/docs/source/users.rst
new file mode 100644
index 00000000000..71fa36e20ec
--- /dev/null
+++ b/docs/source/users.rst
@@ -0,0 +1,15 @@
+.. _user_documentation:
+
+Documentation
+=============
+
+.. toctree::
+   :maxdepth: 1
+
+   installation
+   tutorials
+   user_guide
+   estimator_overview
+   glossary
+   changelog
+   related_software
diff --git a/sktime/annotation/__init__.py b/sktime/annotation/__init__.py
index e69de29bb2d..e21bd818606 100644
--- a/sktime/annotation/__init__.py
+++ b/sktime/annotation/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+"""Implements time series annotation."""
diff --git a/sktime/annotation/adapters/__init__.py b/sktime/annotation/adapters/__init__.py
index 564d7233fb7..dca93c8455f 100644
--- a/sktime/annotation/adapters/__init__.py
+++ b/sktime/annotation/adapters/__init__.py
@@ -1,4 +1,8 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements adapters for time series annotation."""
+
 __all__ = ["PyODAnnotator"]
 
 from sktime.annotation.adapters._pyod import PyODAnnotator
diff --git a/sktime/annotation/adapters/_pyod.py b/sktime/annotation/adapters/_pyod.py
index 60131e35160..9a529de0801 100644
--- a/sktime/annotation/adapters/_pyod.py
+++ b/sktime/annotation/adapters/_pyod.py
@@ -1,4 +1,8 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements outlier detection from pyOD."""
+
 import numpy as np
 from sktime.annotation.base._base import BaseSeriesAnnotator
 
@@ -10,7 +14,7 @@
 
 
 class PyODAnnotator(BaseSeriesAnnotator):
-    """Transformer that applies outlier detector from pyOD
+    """Transformer that applies outlier detector from pyOD.
 
     Parameters
     ----------
@@ -47,11 +51,10 @@ def _fit(self, X, Y=None):
         -------
         self : returns a reference to self
 
-        State change
-        ------------
-        creates fitted model (attributes ending in "_")
+        Notes
+        -----
+        Create fitted model that sets attributes ending in "_".
         """
-
         X_np = X.to_numpy()
 
         if len(X_np.shape) == 1:
@@ -74,7 +77,6 @@ def _predict(self, X):
         Y : pd.Series - annotations for sequence X
             exact format depends on annotation type
         """
-
         fmt = self.fmt
         labels = self.labels
 
diff --git a/sktime/annotation/base/__init__.py b/sktime/annotation/base/__init__.py
index 3da286f8f7f..1b9d1b0ffce 100644
--- a/sktime/annotation/base/__init__.py
+++ b/sktime/annotation/base/__init__.py
@@ -1,4 +1,8 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements base classes for annotation in sktime."""
+
 __all__ = ["BaseSeriesAnnotator"]
 
 from sktime.annotation.base._base import BaseSeriesAnnotator
diff --git a/sktime/annotation/base/_base.py b/sktime/annotation/base/_base.py
index 35ed5d99581..82f85992c1b 100644
--- a/sktime/annotation/base/_base.py
+++ b/sktime/annotation/base/_base.py
@@ -1,6 +1,8 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
 """
-Base class template for annotator base type for time series stream
+Base class template for annotator base type for time series stream.
 
     class name: BaseSeriesAnnotator
 
@@ -48,7 +50,7 @@ class BaseSeriesAnnotator(BaseEstimator):
     Notes
     -----
     Assumes "predict" data is temporal future of "fit"
-        single time series in both, no meta-data
+    Single time series in both, no meta-data.
 
     The base series annotator specifies the methods and method
     signatures that all annotators have to implement.
@@ -74,17 +76,19 @@ def fit(self, X, Y=None):
         Parameters
         ----------
         X : pd.DataFrame
-            training data to fit model to, time series
+            Training data to fit model to (time series).
         Y : pd.Series, optional
-            ground truth annotations for training if annotator is supervised
+            Ground truth annotations for training if annotator is supervised.
+
         Returns
         -------
-        self : returns a reference to self
+        self :
+            Reference to self.
 
-        State change
-        ------------
-        creates fitted model (attributes ending in "_")
-        sets _is_fitted flag to true
+        Notes
+        -----
+        Creates fitted model that updates attributes ending in "_". Sets
+        _is_fitted flag to True.
         """
         check_labels(self.labels)
         check_fmt(self.fmt)
@@ -110,14 +114,14 @@ def predict(self, X):
 
         Parameters
         ----------
-        X : pd.DataFrame - data to annotate, time series
+        X : pd.DataFrame
+            Data to annotate (time series).
 
         Returns
         -------
-        Y : pd.Series - annotations for sequence X
-            exact format depends on annotation type
+        Y : pd.Series
+            Annotations for sequence X. Exact format depends on annotation type.
         """
-
         self.check_is_fitted()
 
         X = check_series(X)
@@ -129,23 +133,24 @@ def predict(self, X):
         return Y
 
     def update(self, X, Y=None):
-        """update model with new data and optional ground truth annotations
+        """Update model with new data and optional ground truth annotations.
 
         Parameters
         ----------
         X : pd.DataFrame
-            training data to update model with, time series
+            Training data to update model with (time series).
         Y : pd.Series, optional
-            ground truth annotations for training if annotator is supervised
+            Ground truth annotations for training if annotator is supervised.
+
         Returns
         -------
-        self : returns a reference to self
+        self :
+            Reference to self.
 
-        State change
-        ------------
-        updates fitted model (attributes ending in "_")
+        Notes
+        -----
+        Updates fitted model that updates attributes ending in "_".
         """
-
         self.check_is_fitted()
 
         X = check_series(X)
@@ -163,23 +168,22 @@ def update(self, X, Y=None):
         return self
 
     def update_predict(self, X):
-        """update model with new data and create annotations for it
+        """Update model with new data and create annotations for it.
 
         Parameters
         ----------
         X : pd.DataFrame
-            training data to update model with, time series
+            Training data to update model with, time series.
 
         Returns
         -------
-        Y : pd.Series - annotations for sequence X
-            exact format depends on annotation type
+        Y : pd.Series
+            Annotations for sequence X. Exact format depends on annotation type.
 
-        State change
-        ------------
-        updates fitted model (attributes ending in "_")
+        Notes
+        -----
+        Updates fitted model that updates attributes ending in "_".
         """
-
         X = check_series(X)
 
         self.update(X=X)
@@ -195,16 +199,18 @@ def _fit(self, X, Y=None):
         Parameters
         ----------
         X : pd.DataFrame
-            training data to fit model to, time series
+            Training data to fit model to (time series).
         Y : pd.Series, optional
-            ground truth annotations for training if annotator is supervised
+            Ground truth annotations for training if annotator is supervised.
+
         Returns
         -------
-        self : returns a reference to self
+        self :
+            Reference to self.
 
-        State change
-        ------------
-        creates fitted model (attributes ending in "_")
+        Notes
+        -----
+        Creates fitted model that sets attributes ending in "_".
         """
         raise NotImplementedError("abstract method")
 
@@ -215,35 +221,37 @@ def _predict(self, X):
 
         Parameters
         ----------
-        X : pd.DataFrame - data to annotate, time series
+        X : pd.DataFrame
+            Data to annotate, time series.
 
         Returns
         -------
-        Y : pd.Series - annotations for sequence X
-            exact format depends on annotation type
+        Y : pd.Series
+            Annotations for sequence X. Exact format depends on annotation type.
         """
         raise NotImplementedError("abstract method")
 
     def _update(self, X, Y=None):
-        """update model with new data and optional ground truth annotations
+        """Update model with new data and optional ground truth annotations.
 
         core logic
 
         Parameters
         ----------
         X : pd.DataFrame
-            training data to update model with, time series
+            Training data to update model with (time series).
         Y : pd.Series, optional
-            ground truth annotations for training if annotator is supervised
+            Ground truth annotations for training if annotator is supervised.
+
         Returns
         -------
-        self : returns a reference to self
+        self :
+            Reference to self.
 
-        State change
-        ------------
-        updates fitted model (attributes ending in "_")
+        Notes
+        -----
+        Updates fitted model that updates attributes ending in "_".
         """
-
         # default/fallback: re-fit to all data
         self._fit(self._X, self._Y)
 
diff --git a/sktime/base/_base.py b/sktime/base/_base.py
index deba7789466..95e2cd7c813 100644
--- a/sktime/base/_base.py
+++ b/sktime/base/_base.py
@@ -72,9 +72,10 @@ def get_class_tags(cls):
 
         Returns
         -------
-        collected_tags : dictionary of tag names : tag values
-            collected from _tags class attribute via nested inheritance
-            NOT overridden by dynamic tags set by set_tags or mirror_tags
+        collected_tags : dict
+            Dictionary of tag name : tag value pairs. Collected from _tags
+            class attribute via nested inheritance. NOT overridden by dynamic
+            tags set by set_tags or mirror_tags.
         """
         collected_tags = dict()
 
@@ -96,13 +97,16 @@ def get_class_tag(cls, tag_name, tag_value_default=None):
 
         Parameters
         ----------
-        tag_name : str, name of tag value
-        tag_value_default : any type, default/fallback value if tag is not found
+        tag_name : str
+            Name of tag value.
+        tag_value_default : any type
+            Default/fallback value if tag is not found.
 
         Returns
         -------
-        tag_value : value of the tag tag_name in self if found
-                    if tag is not found, returns tag_value_default
+        tag_value :
+            Value of the `tag_name` tag in self. If not found, returns
+            `tag_value_default`.
         """
         collected_tags = cls.get_class_tags()
 
@@ -113,9 +117,10 @@ def get_tags(self):
 
         Returns
         -------
-        collected_tags : dictionary of tag names : tag values
-            collected from _tags class attribute via nested inheritance
-            then any overrides and new tags from _tags_dynamic object attribute
+        collected_tags : dict
+            Dictionary of tag name : tag value pairs. Collected from _tags
+            class attribute via nested inheritance and then any overrides
+            and new tags from _tags_dynamic object attribute.
         """
         collected_tags = self.get_class_tags()
 
@@ -129,13 +134,16 @@ def get_tag(self, tag_name, tag_value_default=None):
 
         Parameters
         ----------
-        tag_name : str, name of tag value
-        tag_value_default : any type, default/fallback value if tag is not found
+        tag_name : str
+            Name of tag value.
+        tag_value_default : any type
+            Default/fallback value if tag is not found.
 
         Returns
         -------
-        tag_value : value of the tag tag_name in self if found
-                    if tag is not found, returns tag_value_default
+        tag_value :
+            Value of the `tag_name` tag in self. If not found, returns
+            `tag_value_default`.
         """
         collected_tags = self.get_tags()
 
@@ -146,15 +154,18 @@ def set_tags(self, **tag_dict):
 
         Parameters
         ----------
-        tag_dict : dictionary of tag names : tag values
+        tag_dict : dict
+            Dictionary of tag name : tag value pairs.
 
         Returns
         -------
-        reference to self
+        self :
+            Reference to self.
 
-        State change
-        ------------
-        sets tag values in tag_dict as dynamic tags in self
+        Notes
+        -----
+        Changes object state by setting tag values in tag_dict as dynamic tags
+        in self.
         """
         self._tags_dynamic.update(deepcopy(tag_dict))
 
@@ -165,17 +176,20 @@ def clone_tags(self, estimator, tag_names=None):
 
         Parameters
         ----------
-        estimator : an estimator inheriting from BaseEstimator
-        tag_names : list of str, or str; names of tags to clone
-            default = list of all tags in estimator
+        estimator : estimator inheriting from :class:`BaseEstimator`
+        tag_names : str or list of str, default = None
+            Names of tags to clone. If None then all tags in estimator are used
+            as `tag_names`.
 
         Returns
         -------
-        reference to self
+        self :
+            Reference to self.
 
-        State change
-        ------------
-        sets tag values in tag_set from estimator as dynamic tags in self
+        Notes
+        -----
+        Changes object state by setting tag values in tag_names from estimator as
+        dynamic tags in self.
         """
         tags_est = deepcopy(estimator.get_tags())
 
diff --git a/sktime/base/_meta.py b/sktime/base/_meta.py
index 8173c3014a7..73885dcc340 100644
--- a/sktime/base/_meta.py
+++ b/sktime/base/_meta.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements meta estimator for estimators composed of other estimators."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["_HeterogenousMetaEstimator"]
diff --git a/sktime/benchmarking/evaluation.py b/sktime/benchmarking/evaluation.py
index 37756e4f5fb..68028e7853f 100644
--- a/sktime/benchmarking/evaluation.py
+++ b/sktime/benchmarking/evaluation.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+"""Evaluator class for analyzing results of a machine learning experiment."""
 __author__ = ["Viktor Kazakov", "Markus Löning", "Aaron Bostrom"]
 __all__ = ["Evaluator"]
 
@@ -21,9 +22,7 @@
 
 
 class Evaluator:
-    """
-    Analyze results of machine learning experiments.
-    """
+    """Analyze results of machine learning experiments."""
 
     def __init__(self, results):
         if not isinstance(results, BaseResults):
@@ -43,30 +42,34 @@ def __init__(self, results):
 
     @property
     def metric_names(self):
+        """Return metric names."""
         return self._metric_names
 
     @property
     def metrics(self):
+        """Return metrics."""
         self._check_is_evaluated()
         return self._metrics
 
     @property
     def metrics_by_strategy(self):
+        """Return metrics by strategy."""
         self._check_is_evaluated()
         return self._metrics_by_strategy
 
     @property
     def metrics_by_strategy_dataset(self):
+        """Return metrics by strategy and dataset."""
         self._check_is_evaluated()
         return self._metrics_by_strategy_dataset
 
     def evaluate(self, metric, train_or_test="test", cv_fold="all"):
-        """
+        """Evaluate estimator performance.
+
         Calculates the average prediction error per estimator as well as the
         prediction error achieved by each
         estimator on individual datasets.
         """
-
         # check input
         if isinstance(cv_fold, int) and cv_fold >= 0:
             cv_folds = [cv_fold]  # if single fold, make iterable
@@ -135,7 +138,7 @@ def evaluate(self, metric, train_or_test="test", cv_fold="all"):
         return self._metrics_by_strategy
 
     def plot_boxplots(self, metric_name=None, **kwargs):
-        """Box plot of metric"""
+        """Box plot of metric."""
         self._check_is_evaluated()
         metric_name = self._validate_metric_name(metric_name)
         column = self._get_column_name(metric_name, suffix="mean")
@@ -152,7 +155,8 @@ def plot_boxplots(self, metric_name=None, **kwargs):
         return fig, ax
 
     def rank(self, metric_name=None, ascending=False):
-        """
+        """Determine estimator ranking.
+
         Calculates the average ranks based on the performance of each
         estimator on each dataset
         """
@@ -179,9 +183,7 @@ def rank(self, metric_name=None, ascending=False):
         return ranked
 
     def t_test(self, metric_name=None):
-        """
-        Runs t-test on all possible combinations between the estimators.
-        """
+        """T-test on all possible combinations between the estimators."""
         self._check_is_evaluated()
         metric_name = self._validate_metric_name(metric_name)
         metrics_per_estimator_dataset = self._get_metrics_per_estimator_dataset(
@@ -219,9 +221,8 @@ def t_test(self, metric_name=None):
         return t_df, values_df_multiindex
 
     def sign_test(self, metric_name=None):
-        """
-        Non-parametric test for test for consistent differences between
-        pairs of observations.
+        """Non-parametric test for consistent differences between observation pairs.
+
         See `<https://en.wikipedia.org/wiki/Sign_test>`_ for details about
         the test and
         `<https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy
@@ -253,9 +254,8 @@ def sign_test(self, metric_name=None):
         return sign_df, sign_df_pivot
 
     def ranksum_test(self, metric_name=None):
-        """
-        Non-parametric test for testing consistent differences between pairs
-        of observations.
+        """Non-parametric test of consistent differences between observation pairs.
+
         The test counts the number of observations that are greater, smaller
         and equal to the mean
         `<http://en.wikipedia.org/wiki/Wilcoxon_rank-sum_test>`_.
@@ -295,8 +295,8 @@ def ranksum_test(self, metric_name=None):
         return ranksum_df, values_df_multiindex
 
     def t_test_with_bonferroni_correction(self, metric_name=None, alpha=0.05):
-        """
-        correction used to counteract multiple comparisons
+        """T-test with correction used to counteract multiple comparisons.
+
         https://en.wikipedia.org/wiki/Bonferroni_correction
         """
         self._check_is_evaluated()
@@ -320,13 +320,14 @@ def t_test_with_bonferroni_correction(self, metric_name=None, alpha=0.05):
         return bonfer_df
 
     def wilcoxon_test(self, metric_name=None):
-        """http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
+        """Wilcoxon signed-rank test.
+
+        http://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
         `Wilcoxon signed-rank test
         <https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test>`_.
         Tests whether two  related paired samples come from the same
-        distribution.
-        In particular, it tests whether the distribution of the differences
-        x-y is symmetric about zero
+        distribution. In particular, it tests whether the distribution of the
+        differences x-y is symmetric about zero.
         """
         self._check_is_evaluated()
         metric_name = self._validate_metric_name(metric_name)
@@ -355,7 +356,8 @@ def wilcoxon_test(self, metric_name=None):
         return wilcoxon_df
 
     def friedman_test(self, metric_name=None):
-        """
+        """Friedman test.
+
         The Friedman test is a non-parametric statistical test used to
         detect differences
         in treatments across multiple test attempts. The procedure involves
@@ -383,7 +385,8 @@ def friedman_test(self, metric_name=None):
         return friedman_test, values_df
 
     def nemenyi(self, metric_name=None):
-        """
+        """Nemenyi test.
+
         Post-hoc test run if the `friedman_test` reveals statistical
         significance.
         For more information see `Nemenyi test
@@ -406,8 +409,7 @@ def nemenyi(self, metric_name=None):
         return nemenyi
 
     def fit_runtime(self, unit="s", train_or_test="test", cv_fold="all"):
-        """
-        Calculates the average time for fitting the strategy
+        """Calculate the average time for fitting the strategy.
 
         Parameters
         ----------
@@ -419,7 +421,6 @@ def fit_runtime(self, unit="s", train_or_test="test", cv_fold="all"):
         run_times: Pandas DataFrame
             average run times per estimator and strategy
         """
-
         # check input
         if isinstance(cv_fold, int) and cv_fold >= 0:
             cv_folds = [cv_fold]  # if single fold, make iterable
@@ -524,11 +525,11 @@ def fit_runtime(self, unit="s", train_or_test="test", cv_fold="all"):
         # return self._metrics_by_strategy
 
     def plot_critical_difference_diagram(self, metric_name=None, alpha=0.1):
-        """Plot critical difference diagrams
+        """Plot critical difference diagrams.
 
-        References:
-        -----------
-        original implementation by Aaron Bostrom, modified by Markus Löning
+        References
+        ----------
+        Original implementation by Aaron Bostrom, modified by Markus Löning.
         """
         self._check_is_evaluated()
         metric_name = self._validate_metric_name(metric_name)
@@ -752,11 +753,11 @@ def plot_critical_difference_diagram(self, metric_name=None, alpha=0.1):
         return fig, ax
 
     def _get_column_name(self, metric_name, suffix="mean"):
-        """Helper function to get column name in computed metrics dataframe"""
+        """Get column name in computed metrics dataframe."""
         return f"{metric_name}_{suffix}"
 
     def _check_is_evaluated(self):
-        """Check if evaluator has evaluated any metrics"""
+        """Check if evaluator has evaluated any metrics."""
         if len(self._metric_names) == 0:
             raise NotEvaluatedError(
                 "This evaluator has not evaluated any metric yet. Please call "
@@ -765,7 +766,7 @@ def _check_is_evaluated(self):
             )
 
     def _validate_metric_name(self, metric_name):
-        """Check if metric has already been evaluated"""
+        """Check if metric has already been evaluated."""
         if metric_name is None:
             metric_name = self._metric_names[
                 -1
@@ -780,7 +781,7 @@ def _validate_metric_name(self, metric_name):
         return metric_name
 
     def _get_metrics_per_estimator_dataset(self, metric_name):
-        """Helper function to get old format back, to be deprecated"""
+        """Get old format back, to be deprecated."""
         # TODO deprecate in favor of new pandas data frame based data
         #  representation
         column = f"{metric_name}_mean"
@@ -795,7 +796,7 @@ def _get_metrics_per_estimator_dataset(self, metric_name):
         return d
 
     def _get_metrics_per_estimator(self, metric_name):
-        """Helper function to get old format back, to be deprecated"""
+        """Get old format back, to be deprecated."""
         # TODO deprecate in favor of new pandas data frame based data
         #  representation
         columns = [
diff --git a/sktime/benchmarking/metrics.py b/sktime/benchmarking/metrics.py
index 9230fa9a9ad..b26d49115c0 100644
--- a/sktime/benchmarking/metrics.py
+++ b/sktime/benchmarking/metrics.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+"""Implements metrics for pairwise and aggregate comparison."""
 __all__ = ["PairwiseMetric", "AggregateMetric"]
 __author__ = ["Viktor Kazakov", "Markus Löning"]
 
@@ -7,6 +9,16 @@
 
 
 class PairwiseMetric(BaseMetric):
+    """Compute metric pairwise.
+
+    Parameters
+    ----------
+    func : function
+        Function that computes the pairwise metric.
+
+    name : str
+        Name of the metric.
+    """
 
     def __init__(self, func, name=None, **kwargs):
         name = func.__name__ if name is None else name
@@ -14,27 +26,41 @@ def __init__(self, func, name=None, **kwargs):
         super(PairwiseMetric, self).__init__(name=name, **kwargs)
 
     def compute(self, y_true, y_pred):
+        """Compute metric and standard error."""
         # compute mean
         mean = self.func(y_true, y_pred)
 
         # compute stderr based on pairwise metrics
         n_instances = len(y_true)
         pointwise_metrics = np.array(
-            [self.func([y_true[i]], [y_pred[i]]) for i in range(n_instances)])
+            [self.func([y_true[i]], [y_pred[i]]) for i in range(n_instances)]
+        )
         stderr = np.std(pointwise_metrics) / np.sqrt(
-            n_instances - 1)  # sample standard error of the mean
+            n_instances - 1
+        )  # sample standard error of the mean
 
         return mean, stderr
 
 
 class AggregateMetric(BaseMetric):
+    """Compute aggregate metric with jackknife standard error.
+
+    Parameters
+    ----------
+    func : function
+        Function that computes the aggregate metric.
+
+    name : str
+        Name of the metric.
+    """
 
     def __init__(self, func, method="jackknife", name=None, **kwargs):
         allowed_methods = ("jackknife",)
         if method not in allowed_methods:
             raise NotImplementedError(
                 f"Provided method is not implemented yet. "
-                f"Currently only: {allowed_methods} are implemented")
+                f"Currently only: {allowed_methods} are implemented"
+            )
         self.method = method
 
         name = func.__name__ if name is None else name
@@ -43,10 +69,10 @@ def __init__(self, func, method="jackknife", name=None, **kwargs):
         super(AggregateMetric, self).__init__(name=name, **kwargs)
 
     def compute(self, y_true, y_pred):
-        """Compute metric and standard error
+        """Compute metric and standard error.
 
-        References:
-        -----------
+        References
+        ----------
         .. [1] Efron and Stein, (1981), "The jackknife estimate of variance."
 
         .. [2] McIntosh, Avery. "The Jackknife Estimation Method".
@@ -72,8 +98,8 @@ def compute(self, y_true, y_pred):
 
         # compute metrics on jackknife samples
         jack_pointwise_metric = np.array(
-            [self.func(y_true[idx], y_pred[idx], **self.kwargs)
-             for idx in jack_idx])
+            [self.func(y_true[idx], y_pred[idx], **self.kwargs) for idx in jack_idx]
+        )
 
         # compute standard error over jackknifed metrics
         jack_stderr = self._compute_jackknife_stderr(jack_pointwise_metric)
@@ -81,7 +107,7 @@ def compute(self, y_true, y_pred):
 
     @staticmethod
     def _compute_jackknife_stderr(x):
+        """Compute standard error of jackknife samples.
+        """Compute standard error of jacknife samples.
 
         References
         ----------
@@ -93,7 +119,7 @@ def _compute_jackknife_stderr(x):
 
     @staticmethod
     def _jackknife_resampling(x):
-        """Performs jackknife resampling on numpy arrays.
+        """Perform jackknife resampling on numpy arrays.
 
         Jackknife resampling is a technique to generate 'n' deterministic
         samples
diff --git a/sktime/classification/base.py b/sktime/classification/base.py
index c5c0eb553fd..deaaca9f6e2 100644
--- a/sktime/classification/base.py
+++ b/sktime/classification/base.py
@@ -95,12 +95,13 @@ def fit(self, X, y):
 
         Returns
         -------
-        self : reference to self.
+        self :
+            Reference to self.
 
-        State change
-        ------------
-        creates fitted model (attributes ending in "_")
-        sets is_fitted flag to true
+        Notes
+        -----
+        Changes state by creating a fitted model that updates attributes
+        ending in "_" and sets is_fitted flag to True.
         """
         coerce_to_numpy = self.get_class_tag("coerce-X-to-numpy", False)
 
@@ -186,11 +187,13 @@ def _fit(self, X, y):
 
         Returns
         -------
-        self : reference to self.
+        self :
+            Reference to self.
 
-        State change
-        ------------
-        creates fitted model (attributes ending in "_")
+        Notes
+        -----
+        Changes state by creating a fitted model that updates attributes
+        ending in "_".
         """
         raise NotImplementedError("abstract method")
 
diff --git a/sktime/classification/compose/_column_ensemble.py b/sktime/classification/compose/_column_ensemble.py
index c4806f02d4b..a0ddc3c55e8 100644
--- a/sktime/classification/compose/_column_ensemble.py
+++ b/sktime/classification/compose/_column_ensemble.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
-""" ColumnEnsembleClassifier: For Multivariate Time Series Classification.
-Builds classifiers on each dimension (column) independently
+"""ColumnEnsembleClassifier: For Multivariate Time Series Classification.
 
+Builds classifiers on each dimension (column) independently.
 """
 
 __author__ = ["Aaron Bostrom"]
@@ -57,9 +57,7 @@ def _validate_estimators(self):
 
     # this check whether the column input was a slice object or a tuple.
     def _validate_column_callables(self, X):
-        """
-        Converts callable column specifications.
-        """
+        """Convert callable column specifications."""
         columns = []
         for _, _, column in self.estimators:
             if callable(column):
@@ -68,10 +66,7 @@ def _validate_column_callables(self, X):
         self._columns = columns
 
     def _validate_remainder(self, X):
-        """
-        Validates ``remainder`` and defines ``_remainder`` targeting
-        the remaining columns.
-        """
+        """Validate ``remainder`` and define ``_remainder``."""
         is_estimator = hasattr(self.remainder, "fit") or hasattr(
             self.remainder, "predict_proba"
         )
@@ -90,15 +85,12 @@ def _validate_remainder(self, X):
         self._remainder = ("remainder", self.remainder, remaining_idx)
 
     def _iter(self, replace_strings=False):
-        """
-        Generate (name, estimator, column) tuples.
+        """Generate (name, estimator, column) tuples.
 
         If fitted=True, use the fitted transformations, else use the
         user specified transformations updated with converted column names
         and potentially appended with transformer for remainder.
-
         """
-
         if self.is_fitted:
             estimators = self.estimators_
         else:
@@ -124,7 +116,7 @@ def _iter(self, replace_strings=False):
 
     def fit(self, X, y):
         # the data passed in could be an array of dataframes?
-        """Fit all estimators, fit the data
+        """Fit all estimators, fit the data.
 
         Parameters
         ----------
@@ -172,7 +164,7 @@ def _collect_probas(self, X):
         )
 
     def predict_proba(self, X):
-        """Predict class probabilities for X in 'soft' voting """
+        """Predict class probabilities for X using 'soft' voting."""
         self.check_is_fitted()
         avg = np.average(self._collect_probas(X), axis=0)
         return avg
@@ -185,44 +177,42 @@ def predict(self, X):
 class ColumnEnsembleClassifier(BaseColumnEnsembleClassifier):
     """Applies estimators to columns of an array or pandas DataFrame.
 
-        This estimator allows different columns or column subsets of the input
-        to be transformed separately and the features generated by each
-        transformer
-        will be ensembled to form a single output.
-
-        Parameters
-        ----------
-        estimators : list of tuples
-            List of (name, transformer, column(s)) tuples specifying the
-            transformer objects to be applied to subsets of the data.
-
-            name : string
-                Like in Pipeline and FeatureUnion, this allows the
-                transformer and
-                its parameters to be set using ``set_params`` and searched
-                in grid
-                search.
-            Estimator : estimator or {'drop'}
-                Estimator must support `fit` and `predict_proba`. Special-cased
-                strings 'drop' and 'passthrough' are accepted as well, to
-                indicate to drop the columns
-            column(s) : string or int, array-like of string or int, slice, \
-                boolean mask array or callable
-
-
-        remainder : {'drop', 'passthrough'} or estimator, default 'drop'
-            By default, only the specified columns in `transformations` are
-            transformed and combined in the output, and the non-specified
-            columns are dropped. (default of ``'drop'``).
-            By specifying ``remainder='passthrough'``, all remaining columns
-            that
-            were not specified in `transformations` will be automatically passed
-            through. This subset of columns is concatenated with the output of
-            the transformations.
-            By setting ``remainder`` to be an estimator, the remaining
-            non-specified columns will use the ``remainder`` estimator. The
-            estimator must support `fit` and `transform`.
+    This estimator allows different columns or column subsets of the input
+    to be transformed separately and the features generated by each
+    transformer
+    will be ensembled to form a single output.
 
+    Parameters
+    ----------
+    estimators : list of tuples
+        List of (name, estimator, column(s)) tuples specifying the
+        transformer objects to be applied to subsets of the data.
+
+        name : string
+            Like in Pipeline and FeatureUnion, this allows the
+            transformer and
+            its parameters to be set using ``set_params`` and searched
+            in grid
+            search.
+        estimator : estimator or {'drop'}
+            Estimator must support `fit` and `predict_proba`. Special-cased
+            strings 'drop' and 'passthrough' are accepted as well, to
+            indicate to drop the columns
+        column(s) : string or int, array-like of string or int, slice, \
+            boolean mask array or callable
+
+    remainder : {'drop', 'passthrough'} or estimator, default 'drop'
+        By default, only the specified columns in `transformations` are
+        transformed and combined in the output, and the non-specified
+        columns are dropped. (default of ``'drop'``).
+        By specifying ``remainder='passthrough'``, all remaining columns
+        that
+        were not specified in `transformations` will be automatically passed
+        through. This subset of columns is concatenated with the output of
+        the transformations.
+        By setting ``remainder`` to be an estimator, the remaining
+        non-specified columns will use the ``remainder`` estimator. The
+        estimator must support `fit` and `transform`.
     """
 
     _required_parameters = ["estimators"]
@@ -396,10 +386,9 @@ def _get_column_indices(X, key):
 
 
 def _is_empty_column_selection(column):
-    """
-    Return True if the column selection is empty (empty list or all-False
-    boolean array).
+    """Check if column selection is empty.
 
+    Both an empty list and an all-False boolean array are considered empty.
     """
     if hasattr(column, "dtype") and np.issubdtype(column.dtype, np.bool_):
         return not column.any()
diff --git a/sktime/classification/dictionary_based/_boss.py b/sktime/classification/dictionary_based/_boss.py
index e170d388ddc..ed0c69d56ac 100644
--- a/sktime/classification/dictionary_based/_boss.py
+++ b/sktime/classification/dictionary_based/_boss.py
@@ -79,11 +79,13 @@ class BOSSEnsemble(BaseClassifier):
 
     See Also
     --------
-    :py:class:`IndividualBOSS`, :py:class:`ContractableBOSS`
+    IndividualBOSS, ContractableBOSS
 
+    Notes
+    -----
     For the Java version, see
-    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/
-    main/java/tsml/classifiers/dictionary_based/BOSS.java>`_.
+    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
+    tsml/classifiers/dictionary_based/BOSS.java>`_.
 
     References
     ----------
@@ -91,8 +93,8 @@ class BOSSEnsemble(BaseClassifier):
        in the presence of noise", Data Mining and Knowledge Discovery, 29(6): 2015
        https://link.springer.com/article/10.1007/s10618-014-0377-7
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.dictionary_based import BOSSEnsemble
     >>> from sktime.datasets import load_italy_power_demand
     >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
@@ -423,11 +425,13 @@ class IndividualBOSS(BaseClassifier):
 
     See Also
     --------
-    :py:class:`BOSSEnsemble`, :py:class:`ContractableBOSS`
+    BOSSEnsemble, ContractableBOSS
 
+    Notes
+    -----
     For the Java version, see
-    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/
-    main/java/tsml/classifiers/dictionary_based/BOSS.java>`_.
+    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
+    tsml/classifiers/dictionary_based/BOSS.java>`_.
 
     References
     ----------
diff --git a/sktime/classification/dictionary_based/_cboss.py b/sktime/classification/dictionary_based/_cboss.py
index 3659214c71c..f6986348888 100644
--- a/sktime/classification/dictionary_based/_cboss.py
+++ b/sktime/classification/dictionary_based/_cboss.py
@@ -83,11 +83,13 @@ class ContractableBOSS(BaseClassifier):
 
     See Also
     --------
-    :py:class:`BOSSEnsemble`, :py:class:`IndividualBOSS`
+    BOSSEnsemble, IndividualBOSS
 
+    Notes
+    -----
     For the Java version, see
-    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/
-    main/java/tsml/classifiers/dictionary_based/cBOSS.java>`_.
+    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
+    tsml/classifiers/dictionary_based/cBOSS.java>`_.
 
     References
     ----------
diff --git a/sktime/classification/dictionary_based/_muse.py b/sktime/classification/dictionary_based/_muse.py
index be32e3fe47d..d0738bb95c3 100644
--- a/sktime/classification/dictionary_based/_muse.py
+++ b/sktime/classification/dictionary_based/_muse.py
@@ -87,8 +87,8 @@ class MUSE(BaseClassifier):
     https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/tsml/
     classifiers/multivariate/WEASEL_MUSE.java
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.dictionary_based import MUSE
     >>> from sktime.datasets import load_italy_power_demand
     >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
diff --git a/sktime/classification/dictionary_based/_tde.py b/sktime/classification/dictionary_based/_tde.py
index bbb85a17b7f..4d08c6df86c 100644
--- a/sktime/classification/dictionary_based/_tde.py
+++ b/sktime/classification/dictionary_based/_tde.py
@@ -112,8 +112,8 @@ class TemporalDictionaryEnsemble(BaseClassifier):
     https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
     tsml/classifiers/dictionary_based/TDE.java
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.dictionary_based import TemporalDictionaryEnsemble
     >>> from sktime.datasets import load_italy_power_demand
     >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
diff --git a/sktime/classification/dictionary_based/_weasel.py b/sktime/classification/dictionary_based/_weasel.py
index 1642ef44a1a..6988918bc7d 100644
--- a/sktime/classification/dictionary_based/_weasel.py
+++ b/sktime/classification/dictionary_based/_weasel.py
@@ -100,8 +100,8 @@ class WEASEL(BaseClassifier):
     }
     https://dl.acm.org/doi/10.1145/3132847.3132980
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.dictionary_based import WEASEL
     >>> from sktime.datasets import load_italy_power_demand
     >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
diff --git a/sktime/classification/distance_based/_proximity_forest.py b/sktime/classification/distance_based/_proximity_forest.py
index 9c98895f1db..d026f6ace22 100644
--- a/sktime/classification/distance_based/_proximity_forest.py
+++ b/sktime/classification/distance_based/_proximity_forest.py
@@ -122,7 +122,7 @@ def _derivative_distance(distance_measure, transformer):
 
     :param distance_measure: the distance measure to use
     :param transformer: the transformer to use
-    :return: a distance measure function with built in transformation
+    :returns: a distance measure function with built in transformation
     """
 
     def distance(instance_a, instance_b, **params):
@@ -140,7 +140,7 @@ def distance_predefined_params(distance_measure, **params):
 
     :param distance_measure: the distance measure to use
     :param params: the parameters to use in the distance measure
-    :return: a distance measure with no parameters
+    :returns: a distance measure with no parameters
     """
 
     def distance(instance_a, instance_b):
@@ -154,7 +154,7 @@ def cython_wrapper(distance_measure):
 
      Converts to 1 column per dimension format.
     :param distance_measure: distance measure to wrap
-    :return: a distance measure which automatically formats data for cython
+    :returns: a distance measure which automatically formats data for cython
     distance measures
     """
 
@@ -332,7 +332,7 @@ def dtw_distance_measure_getter(X):
     """Generate the dtw distance measure.
 
     :param X: dataset to derive parameter ranges from
-    :return: distance measure and parameter range dictionary
+    :returns: distance measure and parameter range dictionary
     """
     return {
         "distance_measure": [cython_wrapper(dtw_distance)],
@@ -344,7 +344,7 @@ def msm_distance_measure_getter(X):
     """Generate the msm distance measure.
 
     :param X: dataset to derive parameter ranges from
-    :return: distance measure and parameter range dictionary
+    :returns: distance measure and parameter range dictionary
     """
     n_dimensions = 1  # todo use other dimensions
     return {
@@ -459,7 +459,7 @@ def erp_distance_measure_getter(X):
     """Generate the erp distance measure.
 
     :param X: dataset to derive parameter ranges from
-    :return: distance measure and parameter range dictionary
+    :returns: distance measure and parameter range dictionary
     """
     stdp = _stdp(X)
     instance_length = max_instance_length(X)  # todo should this use the max instance
@@ -479,7 +479,7 @@ def lcss_distance_measure_getter(X):
     """Generate the lcss distance measure.
 
     :param X: dataset to derive parameter ranges from
-    :return: distance measure and parameter range dictionary
+    :returns: distance measure and parameter range dictionary
     """
     stdp = _stdp(X)
     instance_length = max_instance_length(X)  # todo should this use the max instance
@@ -499,7 +499,7 @@ def twe_distance_measure_getter(X):
     """Generate the twe distance measure.
 
     :param X: dataset to derive parameter ranges from
-    :return: distance measure and parameter range dictionary
+    :returns: distance measure and parameter range dictionary
     """
     return {
         "distance_measure": [cython_wrapper(twe_distance)],
@@ -523,7 +523,7 @@ def wdtw_distance_measure_getter(X):
     """Generate the wdtw distance measure.
 
     :param X: dataset to derive parameter ranges from
-    :return: distance measure and parameter range dictionary
+    :returns: distance measure and parameter range dictionary
     """
     return {
         "distance_measure": [cython_wrapper(wdtw_distance)],
@@ -535,7 +535,7 @@ def euclidean_distance_measure_getter(X):
     """Generate the ed distance measure.
 
     :param X: dataset to derive parameter ranges from
-    :return: distance measure and parameter range dictionary
+    :returns: distance measure and parameter range dictionary
     """
     return {"distance_measure": [cython_wrapper(dtw_distance)], "w": [0]}
 
@@ -545,7 +545,7 @@ def setup_wddtw_distance_measure_getter(transformer):
 
     Bakes the derivative transformer into the dtw distance measure
     :param transformer: the transformer to use
-    :return: a getter to produce the distance measure
+    :returns: a getter to produce the distance measure
     """
 
     def getter(X):
@@ -564,7 +564,7 @@ def setup_ddtw_distance_measure_getter(transformer):
 
     Bakes the derivative transformer into the dtw distance measure
     :param transformer: the transformer to use
-    :return: a getter to produce the distance measure
+    :returns: a getter to produce the distance measure
     """
 
     def getter(X):
@@ -582,7 +582,7 @@ def setup_all_distance_measure_getter(proximity):
     """All distance measure getter functions from a proximity object.
 
     :param proximity: a PT / PF / PS
-    :return: a list of distance measure getters
+    :returns: a list of distance measure getters
     """
     transformer = _CachedTransformer(DerivativeSlopeTransformer())
     distance_measure_getters = [
@@ -602,7 +602,7 @@ def pick_rand_distance_measure(proximity):
 
         :param proximity: proximity object containing distance measures,
         ranges and dataset
-        :return: a distance measure with no parameters
+        :returns: a distance measure with no parameters
         """
         random_state = proximity.random_state
         X = proximity.X
@@ -830,7 +830,7 @@ def _distance_to_exemplars_inst(exemplars, instance, distance_measure):
         :param instance: the instance to compare to each exemplar
         :param distance_measure: the distance measure to provide similarity
         values
-        :return: list of distances to each exemplar
+        :returns: list of distances to each exemplar
         """
         n_exemplars = len(exemplars)
         distances = np.empty(n_exemplars)
@@ -853,8 +853,8 @@ def distance_to_exemplars(self, X):
         ----------
         X: the dataset containing a list of instances
 
-        Return
-        ------
+        Returns
+        -------
         2d numpy array of distances from each instance to each
         exemplar (instance by exemplar)
         """
@@ -918,8 +918,8 @@ def find_closest_exemplar_indices(self, X):
         ----------
         X: the dataframe containing instances
 
-        Return
-        ------
+        Returns
+        -------
         1d numpy array of indices, one for each instance,
         reflecting the index of the closest exemplar
         """
diff --git a/sktime/classification/distance_based/_time_series_neighbors.py b/sktime/classification/distance_based/_time_series_neighbors.py
index 87f16dda67e..30735569a17 100644
--- a/sktime/classification/distance_based/_time_series_neighbors.py
+++ b/sktime/classification/distance_based/_time_series_neighbors.py
@@ -94,8 +94,8 @@ class KNeighborsTimeSeriesClassifier(_KNeighborsClassifier, BaseClassifier):
     'wdtw','lcss','erp','msm','twe'}: default ='dtw'
     distance_params   : dictionary for metric parameters: default = None
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
     >>> from sktime.datasets import load_basic_motions
     >>> X_train, y_train = load_basic_motions(return_X_y=True, split="train")
diff --git a/sktime/classification/feature_based/_catch22_classifier.py b/sktime/classification/feature_based/_catch22_classifier.py
index a80496991bc..7eb03ed469b 100644
--- a/sktime/classification/feature_based/_catch22_classifier.py
+++ b/sktime/classification/feature_based/_catch22_classifier.py
@@ -46,9 +46,11 @@ class Catch22Classifier(BaseClassifier):
 
     See Also
     --------
-    :py:class:`Catch22`
+    Catch22
 
-    Authors 'catch22ForestClassifier <https://github.com/chlubba/sktime-catch22>`_.
+    Notes
+    -----
+    Authors `catch22ForestClassifier <https://github.com/chlubba/sktime-catch22>`_.
 
     For the Java version, see `tsml <https://github.com/uea-machine-learning/tsml/blob
     /master/src/main/java/tsml/classifiers/hybrids/Catch22Classifier.java>`_.
@@ -59,8 +61,8 @@ class Catch22Classifier(BaseClassifier):
         Data Mining and Knowledge Discovery 33.6 (2019): 1821-1852.
         https://link.springer.com/article/10.1007/s10618-019-00647-x
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.feature_based import Catch22Classifier
     >>> from sktime.datasets import load_italy_power_demand
     >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
diff --git a/sktime/classification/feature_based/_matrix_profile_classifier.py b/sktime/classification/feature_based/_matrix_profile_classifier.py
index c3aa7c87f54..8f30f1f3a4e 100644
--- a/sktime/classification/feature_based/_matrix_profile_classifier.py
+++ b/sktime/classification/feature_based/_matrix_profile_classifier.py
@@ -44,7 +44,7 @@ class MatrixProfileClassifier(BaseClassifier):
 
     See Also
     --------
-    :py:class:`MatrixProfile`
+    MatrixProfile
 
     References
     ----------
@@ -53,8 +53,8 @@ class MatrixProfileClassifier(BaseClassifier):
         Knowledge Discovery 32.1 (2018): 83-123.
         https://link.springer.com/article/10.1007/s10618-017-0519-9
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.feature_based import MatrixProfileClassifier
     >>> from sktime.datasets import load_italy_power_demand
     >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
diff --git a/sktime/classification/feature_based/_signature_classifier.py b/sktime/classification/feature_based/_signature_classifier.py
index f612faebd0d..de1938c5dfd 100644
--- a/sktime/classification/feature_based/_signature_classifier.py
+++ b/sktime/classification/feature_based/_signature_classifier.py
@@ -82,10 +82,10 @@ class SignatureClassifier(BaseClassifier):
 
     See Also
     --------
-    :py:class:`SignatureTransformer`
+    SignatureTransformer
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.feature_based import SignatureClassifier
     >>> from sktime.datasets import load_italy_power_demand
     >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
diff --git a/sktime/classification/feature_based/_tsfresh_classifier.py b/sktime/classification/feature_based/_tsfresh_classifier.py
index 6a12072a5fe..fe490939875 100644
--- a/sktime/classification/feature_based/_tsfresh_classifier.py
+++ b/sktime/classification/feature_based/_tsfresh_classifier.py
@@ -56,7 +56,7 @@ class TSFreshClassifier(BaseClassifier):
 
     See Also
     --------
-    :py:class:`TSFreshFeatureExtractor`, :py:class:`TSFreshRelevantFeatureExtractor`
+    TSFreshFeatureExtractor, TSFreshRelevantFeatureExtractor
 
     References
     ----------
@@ -65,8 +65,8 @@ class TSFreshClassifier(BaseClassifier):
         (2018): 72-77.
         https://www.sciencedirect.com/science/article/pii/S0925231218304843
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.classification.feature_based import TSFreshClassifier
     >>> from sktime.datasets import load_italy_power_demand
     >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
diff --git a/sktime/classification/interval_based/_rise.py b/sktime/classification/interval_based/_rise.py
index aa02ffd357e..5d6cd46b38d 100644
--- a/sktime/classification/interval_based/_rise.py
+++ b/sktime/classification/interval_based/_rise.py
@@ -74,9 +74,7 @@ def _make_estimator(base_estimator, random_state=None):
 
 
 def _select_interval(min_interval, max_interval, series_length, rng, method=3):
-    """
-    private function used to select an interval for a single tree
-    """
+    """Private function used to select an interval for a single tree."""
     interval = np.empty(2, dtype=int)
     if method == 0:
         interval[0] = rng.randint(series_length - min_interval)
@@ -102,9 +100,7 @@ def _select_interval(min_interval, max_interval, series_length, rng, method=3):
 def _produce_intervals(
     n_estimators, min_interval, max_interval, series_length, rng, method=3
 ):
-    """
-    private function used to produce intervals for all trees
-    """
+    """Private function used to produce intervals for all trees."""
     intervals = np.empty((n_estimators, 2), dtype=int)
     if method == 0:
         # just keep it as a backup, untested
@@ -312,7 +308,9 @@ def fit(self, X, y):
         return self
 
     def predict(self, X):
-        """Find predictions for all cases in X. Built on top of `predict_proba.
+        """Find predictions for all cases in X.
+
+        Built on top of `predict_proba`.
 
         Parameters
         ----------
@@ -339,8 +337,8 @@ def predict_proba(self, X):
             single column (i.e., univariate classification). RISE has no
             bespoke method for multivariate classification as yet.
 
-        Local variables
-        ---------------
+        Attributes
+        ----------
         n_instances : int
             Number of cases to classify.
         n_columns : int
@@ -396,8 +394,8 @@ def acf(x, max_lag):
     max_lag: int
         The number of ACF terms to find.
 
-    Return
-    ----------
+    Returns
+    -------
     y : array-like shape = [max_lag]
     """
     y = np.empty(max_lag)
@@ -499,8 +497,8 @@ def matrix_acf(x, num_cases, max_lag):
     max_lag: int
         The number of ACF terms to find.
 
-    Return
-    ----------
+    Returns
+    -------
     y : array-like shape = [num_cases,max_lag]
 
     """
@@ -542,7 +540,8 @@ def matrix_acf(x, num_cases, max_lag):
 
 
 def ps(x, sign=1, n=None, pad="mean"):
-    """
+    """Power spectrum transformer.
+
     Power spectrum transform, currently calculated using np function.
     It would be worth looking at ff implementation, see difference in speed
     to java.
@@ -557,8 +556,8 @@ def ps(x, sign=1, n=None, pad="mean"):
         see numpy.pad for more details
         https://numpy.org/doc/stable/reference/generated/numpy.pad.html
 
-    Return
-    ----------
+    Returns
+    -------
     y : array-like shape = [len(x)/2]
     """
     x_len = x.shape[-1]
diff --git a/sktime/classification/shapelet_based/_stc.py b/sktime/classification/shapelet_based/_stc.py
index 2e06979f5c8..3ce223dba56 100644
--- a/sktime/classification/shapelet_based/_stc.py
+++ b/sktime/classification/shapelet_based/_stc.py
@@ -79,6 +79,8 @@ def fit(self, X, y):
         """Perform a shapelet transform then builds a random forest.
 
         Contract default for ST is 5 hours
+
+        Parameters
         ----------
         X : array-like or sparse matrix of shape = [n_instances,
         series_length] or shape = [n_instances,n_columns]
diff --git a/sktime/classification/shapelet_based/mrseql/mrseql.pyx b/sktime/classification/shapelet_based/mrseql/mrseql.pyx
index 02c8aac77bc..076cf8d0ffe 100644
--- a/sktime/classification/shapelet_based/mrseql/mrseql.pyx
+++ b/sktime/classification/shapelet_based/mrseql/mrseql.pyx
@@ -386,11 +386,12 @@ class MrSEQLClassifier(BaseClassifier):
             return self.seql_clf.predict_proba(mr_seqs)
 
     def predict(self, X):
-        """
-        Predict class labels for samples in X.
+        """Predict class labels for samples in X.
+
         Parameters
         ----------
         X : time series data.
+
         Returns
         -------
         C : array
@@ -400,7 +401,8 @@ class MrSEQLClassifier(BaseClassifier):
         return np.array([self.classes_[np.argmax(prob)] for prob in proba])
 
     def map_sax_model(self, ts):
-        """    For interpretation.
+        """For interpretation.
+
         Returns vectors of weights with the same length of the input time series.
         The weight of each point implies its contribution in the classification decision regarding the class.
 
@@ -412,8 +414,8 @@ class MrSEQLClassifier(BaseClassifier):
         -------
         weighted_ts: ndarray of (number of classes, length of time series)
 
-        Note
-        -------
+        Notes
+        -----
         Only supports univariate time series and SAX features.
         """
         self.check_is_fitted()
diff --git a/sktime/datasets/__init__.py b/sktime/datasets/__init__.py
index 362301b36fd..2fc7692ac29 100644
--- a/sktime/datasets/__init__.py
+++ b/sktime/datasets/__init__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Load data functions."""
+"""Functions to load datasets included in sktime."""
 
 __all__ = [
     "load_airline",
@@ -16,6 +16,7 @@
     "load_unit_test",
     "load_uschange",
     "load_PBS_dataset",
+    "load_japanese_vowels",
 ]
 
 from sktime.datasets._data_io import load_airline
diff --git a/sktime/datasets/_data_io.py b/sktime/datasets/_data_io.py
index bc7a1d64ec0..c356a728dc0 100644
--- a/sktime/datasets/_data_io.py
+++ b/sktime/datasets/_data_io.py
@@ -224,13 +224,15 @@ def load_gunpoint(split=None, return_X_y=False):
         The time series data for the problem with m cases and c dimensions
     y: numpy array
         The class labels for each case in X
-    Details
-    -------
+
+    Notes
+    -----
     Dimensionality:     univariate
     Series length:      150
     Train cases:        50
     Test cases:         150
     Number of classes:  2
+
     This dataset involves one female actor and one male actor making a
     motion with their
     hand. The two classes are: Gun-Draw and Point: For Gun-Draw the actors
@@ -276,8 +278,8 @@ def load_osuleaf(split=None, return_X_y=False):
     y: numpy array
         The class labels for each case in X
 
-    Details
-    -------
+    Notes
+    -----
     Dimensionality:     univariate
     Series length:      427
     Train cases:        200
@@ -319,8 +321,8 @@ def load_italy_power_demand(split=None, return_X_y=False):
     y: numpy array
         The class labels for each case in X
 
-    Details
-    -------
+    Notes
+    -----
     Dimensionality:     univariate
     Series length:      24
     Train cases:        67
@@ -395,8 +397,8 @@ def load_japanese_vowels(split=None, return_X_y=False):
     y: numpy array
         The class labels for each case in X
 
-    Details
-    -------
+    Notes
+    -----
     Dimensionality:     multivariate, 12
     Series length:      29
     Train cases:        270
@@ -448,8 +450,8 @@ def load_arrow_head(split=None, return_X_y=False):
     y: numpy array
         The class labels for each case in X
 
-    Details
-    -------
+    Notes
+    -----
     Dimensionality:     univariate
     Series length:      251
     Train cases:        36
@@ -496,8 +498,8 @@ def load_acsf1(split=None, return_X_y=False):
     y: numpy array
         The class labels for each case in X
 
-    Details
-    -------
+    Notes
+    -----
     Dimensionality:     univariate
     Series length:      1460
     Train cases:        100
@@ -541,8 +543,8 @@ def load_basic_motions(split=None, return_X_y=False):
     y: numpy array
         The class labels for each case in X
 
-    Details
-    -------
+    Notes
+    -----
     Dimensionality:     univariate
     Series length:      100
     Train cases:        40
@@ -573,8 +575,8 @@ def load_shampoo_sales():
     y : pandas Series/DataFrame
         Shampoo sales dataset
 
-    Details
-    -------
+    Notes
+    -----
     This dataset describes the monthly number of sales of shampoo over a 3
     year period.
     The units are a sales count.
@@ -616,8 +618,8 @@ def load_longley(y_name="TOTEMP"):
     X: pandas.DataFrame
         The exogenous time series data for the problem.
 
-    Details
-    -------
+    Notes
+    -----
     This mulitvariate time series dataset contains various US macroeconomic
     variables from 1947 to 1962 that are known to be highly collinear.
 
@@ -664,8 +666,8 @@ def load_lynx():
     y : pandas Series/DataFrame
         Lynx sales dataset
 
-    Details
-    -------
+    Notes
+    -----
     The annual numbers of lynx trappings for 1821–1934 in Canada. This
     time-series records the number of skins of
     predators (lynx) that were collected over several years by the Hudson's
@@ -678,8 +680,6 @@ def load_lynx():
     Frequency:          Yearly
     Number of cases:    1
 
-    Notes
-    -----
     This data shows aperiodic, cyclical patterns, as opposed to periodic,
     seasonal patterns.
 
@@ -712,8 +712,8 @@ def load_airline():
     y : pd.Series
      Time series
 
-    Details
-    -------
+    Notes
+    -----
     The classic Box & Jenkins airline data. Monthly totals of international
     airline passengers, 1949 to 1960.
 
@@ -722,8 +722,6 @@ def load_airline():
     Frequency:          Monthly
     Number of cases:    1
 
-    Notes
-    -----
     This data shows an increasing trend, non-constant (increasing) variance
     and periodic, seasonal patterns.
 
@@ -755,8 +753,8 @@ def load_uschange(y_name="Consumption"):
     X : pandas Dataframe
         columns with explanatory variables
 
-    Details
-    -------
+    Notes
+    -----
     Percentage changes in quarterly personal consumption expenditure,
     personal disposable income, production, savings and the
     unemployment rate for the US, 1960 to 2016.
@@ -769,8 +767,6 @@ def load_uschange(y_name="Consumption"):
     Frequency:          Quarterly
     Number of cases:    1
 
-    Notes
-    -----
     This data shows an increasing trend, non-constant (increasing) variance
     and periodic, seasonal patterns.
 
@@ -798,16 +794,15 @@ def load_uschange(y_name="Consumption"):
 
 
 def load_PBS_dataset():
-    """
-    Load the Pharmaceutical Benefit Scheme univariate time series dataset [1].
+    """Load the Pharmaceutical Benefit Scheme univariate time series dataset [1].
 
     Returns
     -------
     y : pd.Series
      Time series
 
-    Details
-    -------
+    Notes
+    -----
     The Pharmaceutical Benefits Scheme (PBS) is the Australian government drugs
     subsidy scheme.
     Data comprises of the numbers of scripts sold each month for immune sera
@@ -819,8 +814,6 @@ def load_PBS_dataset():
     Frequency:          Monthly
     Number of cases:    1
 
-    Notes
-    -----
     The time series is intermittent, i.e contains small counts,
     with many months registering no sales at all,
     and only small numbers of items sold in other months.
diff --git a/sktime/datasets/setup.py b/sktime/datasets/setup.py
index 7d29e0208da..8360b9f9278 100644
--- a/sktime/datasets/setup.py
+++ b/sktime/datasets/setup.py
@@ -2,11 +2,14 @@
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
 """Set up the datasets included in sktime."""
+
 __author__ = "Markus Löning"
 
+# The file is adapted from:
+# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/setup.py
+
 
-# adapted from https://github.com/scikit-learn/scikit-learn/blob/master
-# /sklearn/datasets/setup.py
+__author__ = ["mloning"]
 
 
 def configuration(parent_package="", top_path=None):
diff --git a/sktime/datasets/tsc_dataset_names.py b/sktime/datasets/tsc_dataset_names.py
index f86b79f81f0..8202f8ee9bd 100644
--- a/sktime/datasets/tsc_dataset_names.py
+++ b/sktime/datasets/tsc_dataset_names.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
+
 """
-Lists of datasets available from the archive on timeseriesclassification.com
+List of datasets available from the timeseriesclassification.com archive.
 
 There are four main distinctions: univariate/multivariate equal/unequal length.
 Array univariate lists the 128 UCR problems, as described in [1].
diff --git a/sktime/forecasting/__init__.py b/sktime/forecasting/__init__.py
index e69de29bb2d..be217575161 100644
--- a/sktime/forecasting/__init__.py
+++ b/sktime/forecasting/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+"""Implements univariate and multivariate forecasting models."""
diff --git a/sktime/forecasting/all/__init__.py b/sktime/forecasting/all/__init__.py
index a47faabc2e2..262dabfd750 100644
--- a/sktime/forecasting/all/__init__.py
+++ b/sktime/forecasting/all/__init__.py
@@ -1,8 +1,9 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
-"""Module exports for forecasting module."""
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Import all time series forecasting functionality available in sktime."""
 
-__author__ = ["Markus Löning"]
+__author__ = ["mloning"]
 __all__ = [
     "ForecastingHorizon",
     "load_lynx",
diff --git a/sktime/forecasting/arima.py b/sktime/forecasting/arima.py
index 658c4a4ec25..ef852980536 100644
--- a/sktime/forecasting/arima.py
+++ b/sktime/forecasting/arima.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements autoregressive integrated moving average (ARIMA) models."""
 
 __author__ = ["Markus Löning", "Hongyi Yang"]
 __all__ = ["AutoARIMA", "ARIMA"]
@@ -211,8 +212,8 @@ class AutoARIMA(_PmdArimaAdapter):
     ----------
     https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.AutoARIMA.html
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.arima import AutoARIMA
     >>> y = load_airline()
@@ -386,7 +387,8 @@ class ARIMA(_PmdArimaAdapter):
     When two out of the three terms are zeros, the model may be referred to
     based on the non-zero parameter, dropping "AR", "I" or "MA" from the
     acronym describing the model. For example, ``ARIMA(1,0,0)`` is ``AR(1)``,
-    ``ARIMA(0,1,0)`` is ``I(1)``, and ``ARIMA(0,0,1)`` is ``MA(1)``. [1]
+    ``ARIMA(0,1,0)`` is ``I(1)``, and ``ARIMA(0,0,1)`` is ``MA(1)``. [1]_
+
     See notes for more practical information on the ``ARIMA`` class.
 
     Parameters
@@ -509,13 +511,14 @@ def foo_loss(y_true, y_pred)
 
     References
     ----------
-    https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.ARIMA.html
-    https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.sarimax.SARIMAX.html
+    .. [1] https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.ARIMA.html
+    .. [2] https://www.statsmodels.org/stable/generated/
+      statsmodels.tsa.statespace.sarimax.SARIMAX.html
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
-    >>> from sktime.forecasting.arima import AutoARIMA
+    >>> from sktime.forecasting.arima import ARIMA
     >>> y = load_airline()
     >>> forecaster = ARIMA(
     ...     order=(1, 1, 0),
diff --git a/sktime/forecasting/base/__init__.py b/sktime/forecasting/base/__init__.py
index c92f05921cd..2d6be2fb551 100644
--- a/sktime/forecasting/base/__init__.py
+++ b/sktime/forecasting/base/__init__.py
@@ -1,4 +1,8 @@
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements base classes for forecasting in sktime."""
+
 __all__ = [
     "ForecastingHorizon",
     "BaseForecaster",
diff --git a/sktime/forecasting/base/_base.py b/sktime/forecasting/base/_base.py
index f3ac91aac57..e24821b0194 100644
--- a/sktime/forecasting/base/_base.py
+++ b/sktime/forecasting/base/_base.py
@@ -103,10 +103,13 @@ def fit(self, y, X=None, fh=None):
             Exogeneous data
         Returns
         -------
-        self : reference to self.
+        self :
+            Reference to self.
 
-        State change
-        ------------
+        Notes
+        -----
+        Changes state by creating a fitted model that updates attributes
+        ending in "_" and sets is_fitted flag to True.
         stores data in self._X and self._y
         stores fh, if passed
         updates self.cutoff to most recent time in y
@@ -355,11 +358,11 @@ def update(self, y, X=None, update_params=True):
         -------
         self : reference to self
 
-        State change
-        ------------
-        updates self._X and self._y with new data
-        updates self.cutoff to most recent time in y
-        if update_params=True, updates model (attributes ending in "_")
+        Notes
+        -----
+        Updates self._y and self._X with `y` and `X`, respectively.
+        Updates self._cutoff to last index seen in `y`. If update_params=True,
+        updates the fitted model, i.e., attributes ending in "_".
         """
         self.check_is_fitted()
 
@@ -662,9 +665,9 @@ def _set_cutoff(self, cutoff):
         ----------
         cutoff: pandas compatible index element
 
-        State change
-        ------------
-        self._cutoff is set to cutoff
+        Notes
+        -----
+        Set self._cutoff to `cutoff`.
         """
         self._cutoff = cutoff
 
@@ -676,9 +679,9 @@ def _set_cutoff_from_y(self, y):
         y: pd.Series, pd.DataFrame, or np.array
             Target time series to which to fit the forecaster.
 
-        State change
-        ------------
-        self._cutoff is set to last index seen in y
+        Notes
+        -----
+        Set self._cutoff to last index seen in `y`.
         """
         if mtype(y, as_scitype="Series") in ["pd.Series", "pd.DataFrame"]:
             self._cutoff = y.index[-1]
@@ -855,11 +858,11 @@ def _update(self, y, X=None, update_params=True):
         y_pred_int : pd.DataFrame - only if return_pred_int=True
             Prediction intervals
 
-        State change
-        ------------
-        updates self._X and self._y with new data
-        updates self.cutoff to most recent time in y
-        if update_params=True, updates model (attributes ending in "_")
+        Notes
+        -----
+        Updates self._y and self._X with `y` and `X`, respectively.
+        Updates self._cutoff to last index seen in `y`. If update_params=True,
+        updates the fitted model, i.e., attributes ending in "_".
         """
         if update_params:
             # default to re-fitting if update is not implemented
diff --git a/sktime/forecasting/base/_fh.py b/sktime/forecasting/base/_fh.py
index 1c3c7ae1276..443b62c145d 100644
--- a/sktime/forecasting/base/_fh.py
+++ b/sktime/forecasting/base/_fh.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements functionality for specifying forecast horizons in sktime."""
 
 __author__ = ["mloning", "fkiraly"]
 __all__ = ["ForecastingHorizon"]
@@ -46,11 +48,10 @@
 
 
 def _delegator(method):
-    """Decorate ForecastingHorizon with pandas.index methods.
+    """Automatically decorate ForecastingHorizon class with pandas.Index methods.
 
-    Helper function to automatically decorate ForecastingHorizon class with
+    This helper lets the ForecastingHorizon class expose
     methods from pandas.Index and delegate method calls to wrapped pandas.Index
-    object.
     """
 
     def delegated(obj, *args, **kwargs):
@@ -60,7 +61,10 @@ def delegated(obj, *args, **kwargs):
 
 
 def _check_values(values):
-    """Validate forecasting horizon values and coerce to pandas.Index type.
+    """Validate forecasting horizon values.
+
+    Validation checks validity and also converts forecasting horizon values
+    to supported pandas.Index types if possible.
 
     Parameters
     ----------
@@ -69,7 +73,8 @@ def _check_values(values):
 
     Raises
     ------
-    TypeError : if values type is not supported
+    TypeError :
+        Raised if `values` type is not supported
 
     Returns
     -------
@@ -178,15 +183,15 @@ def _new(self, values=None, is_relative=None):
         Parameters
         ----------
         values : pd.Index, np.array, list or int
-            Values of forecasting horizon
-        is_relative : bool, optional (default=same as self.is_relative)
+            Values of forecasting horizon.
+        is_relative : bool, default=same as self.is_relative
         - If None, determined automatically: same as self.is_relative
         - If True, values are relative to end of training series.
         - If False, values are absolute.
 
         Returns
         -------
-        ForecastingHorizon
+        ForecastingHorizon :
             New ForecastingHorizon based on current object
         """
         if values is None:
@@ -197,7 +202,7 @@ def _new(self, values=None, is_relative=None):
 
     @property
     def is_relative(self):
-        """Whether forecasting horizon is relative.
+        """Whether forecasting horizon is relative to the end of the training series.
 
         Returns
         -------
@@ -206,16 +211,17 @@ def is_relative(self):
         return self._is_relative
 
     def to_pandas(self):
-        """Return underlying values as pd.Index.
+        """Return forecasting horizon's underlying values as pd.Index.
 
         Returns
         -------
         fh : pd.Index
+            pandas Index containing forecasting horizon's underlying values.
         """
         return self._values
 
     def to_numpy(self, **kwargs):
-        """Return underlying values as np.array.
+        """Return forecasting horizon's underlying values as np.array.
 
         Parameters
         ----------
@@ -225,6 +231,7 @@ def to_numpy(self, **kwargs):
         Returns
         -------
         fh : np.ndarray
+            NumPy array containing forecasting horizon's underlying values.
         """
         return self.to_pandas().to_numpy(**kwargs)
 
@@ -233,18 +240,18 @@ def to_numpy(self, **kwargs):
     # calling different methods.
     @lru_cache(typed=True)
     def to_relative(self, cutoff=None):
-        """Return relative values.
+        """Return forecasting horizon values relative to a cutoff.
 
         Parameters
         ----------
         cutoff : pd.Period, pd.Timestamp, int, optional (default=None)
-            Cutoff value is required to convert a relative forecasting
-            horizon to an absolute one and vice versa.
+            Cutoff value required to convert a relative forecasting
+            horizon to an absolute one (and vice versa).
 
         Returns
         -------
         fh : ForecastingHorizon
-            Relative representation of forecasting horizon
+            Relative representation of forecasting horizon.
         """
         if self.is_relative:
             return self._new()
@@ -275,18 +282,18 @@ def to_relative(self, cutoff=None):
 
     @lru_cache(typed=True)
     def to_absolute(self, cutoff):
-        """Convert ForecastingHorizon to absolute and return.
+        """Return absolute version of forecasting horizon values.
 
         Parameters
         ----------
         cutoff : pd.Period, pd.Timestamp, int
             Cutoff value is required to convert a relative forecasting
-            horizon to an absolute one and vice versa.
+            horizon to an absolute one (and vice versa).
 
         Returns
         -------
         fh : ForecastingHorizon
-            Absolute representation of forecasting horizon
+            Absolute representation of forecasting horizon.
         """
         if not self.is_relative:
             return self._new()
@@ -318,14 +325,14 @@ def to_absolute_int(self, start, cutoff=None):
         start : pd.Period, pd.Timestamp, int
             Start value returned as zero.
         cutoff : pd.Period, pd.Timestamp, int, optional (default=None)
-            Cutoff value is required to convert a relative forecasting
-            horizon to an absolute one and vice versa.
+            Cutoff value required to convert a relative forecasting
+            horizon to an absolute one (and vice versa).
 
         Returns
         -------
         fh : ForecastingHorizon
             Absolute representation of forecasting horizon as zero-based
-            integer index
+            integer index.
         """
         # We here check the start value, the cutoff value is checked when we use it
         # to convert the horizon to the absolute representation below
@@ -349,13 +356,13 @@ def to_in_sample(self, cutoff=None):
         Parameters
         ----------
         cutoff : pd.Period, pd.Timestamp, int, optional (default=None)
-            Cutoff value is required to convert a relative forecasting
-            horizon to an absolute one and vice versa.
+            Cutoff value required to convert a relative forecasting
+            horizon to an absolute one (and vice versa).
 
         Returns
         -------
         fh : ForecastingHorizon
-            In-sample values of forecasting horizon
+            In-sample values of forecasting horizon.
         """
         is_in_sample = self._is_in_sample(cutoff)
         in_sample = self.to_pandas()[is_in_sample]
@@ -368,12 +375,12 @@ def to_out_of_sample(self, cutoff=None):
         ----------
         cutoff : pd.Period, pd.Timestamp, int, optional (default=None)
             Cutoff value is required to convert a relative forecasting
-            horizon to an absolute one and vice versa.
+            horizon to an absolute one (and vice versa).
 
         Returns
         -------
         fh : ForecastingHorizon
-            Out-of-sample values of forecasting horizon
+            Out-of-sample values of forecasting horizon.
         """
         is_out_of_sample = self._is_out_of_sample(cutoff)
         out_of_sample = self.to_pandas()[is_out_of_sample]
@@ -384,13 +391,12 @@ def _is_in_sample(self, cutoff=None):
         return self.to_relative(cutoff).to_pandas() <= 0
 
     def is_all_in_sample(self, cutoff=None):
-        """Whether or not the fh is purely in-sample given cutoff, yes/no.
+        """Whether the forecasting horizon is purely in-sample for given cutoff.
 
         Parameters
         ----------
-        cutoff : pd.Period, pd.Timestamp, int, optional (default=None)
-            Cutoff value is required to convert a relative forecasting
-            horizon to an absolute one and vice versa.
+        cutoff : pd.Period, pd.Timestamp, int, default=None
+            Cutoff value used to check if forecasting horizon is purely in-sample.
 
         Returns
         -------
@@ -405,13 +411,13 @@ def _is_out_of_sample(self, cutoff=None):
         return self.to_relative(cutoff).to_pandas() > 0
 
     def is_all_out_of_sample(self, cutoff=None):
-        """Whether or not the fh is purely out-of-sample given cutoff, yes/no.
+        """Whether the forecasting horizon is purely out-of-sample for given cutoff.
 
         Parameters
         ----------
         cutoff : pd.Period, pd.Timestamp, int, optional (default=None)
-            Cutoff value is required to convert a relative forecasting
-            horizon to an absolute one and vice versa.
+            Cutoff value used to check if forecasting horizon is purely
+            out-of-sample.
 
         Returns
         -------
@@ -427,7 +433,7 @@ def to_indexer(self, cutoff=None, from_cutoff=True):
         Parameters
         ----------
         cutoff : pd.Period, pd.Timestamp, int, optional (default=None)
-            Cutoff value is required to convert a relative forecasting
+            Cutoff value required to convert a relative forecasting
             horizon to an absolute one and vice versa.
         from_cutoff : bool, optional (default=True)
             - If True, zero-based relative to cutoff.
@@ -437,7 +443,7 @@ def to_indexer(self, cutoff=None, from_cutoff=True):
         Returns
         -------
         fh : pd.Index
-            Indexer
+            Indexer.
         """
         if from_cutoff:
             return self.to_relative(cutoff).to_pandas() - 1
@@ -453,10 +459,19 @@ def __repr__(self):
 
 
 def _check_cutoff(cutoff, index):
-    """Check whether cutoff is compatible with fh index type.
+    """Check if the cutoff is valid based on time index of forecasting horizon.
 
-    Helper function to check if the cutoff contains all necessary information and is
-    compatible with the time index of the forecasting horizon
+    Validates that the cutoff contains necessary information and is
+    compatible with the time index of the forecasting horizon.
+
+    Parameters
+    ----------
+    cutoff : pd.Period, pd.Timestamp, int
+        Cutoff value is required to convert a relative forecasting
+        horizon to an absolute one and vice versa.
+    index : pd.PeriodIndex or pd.DatetimeIndex
+        Forecasting horizon time index that the cutoff value will be checked
+        against.
     """
     if cutoff is None:
         raise ValueError("`cutoff` must be given, but found none.")
@@ -491,10 +506,22 @@ def _check_start(start, index):
 
 
 def _coerce_to_period(x, freq=None):
-    """Coerce compatible index type to pd.PeriodIndex.
+    """Coerce pandas time index to an alternative pandas time index.
 
-    Helper function to coerce pd.Timestamp to pd.Period or pd.DatetimeIndex to
-    pd.PeriodIndex for more reliable arithmetic operations with time indices
+    This coerces pd.Timestamp to pd.Period or pd.DatetimeIndex to
+    pd.PeriodIndex, because pd.Period and pd.PeriodIndex allow more reliable
+    arithmetic operations with time indices.
+
+    Parameters
+    ----------
+    x : pandas Index
+        pandas Index to convert.
+    freq :
+
+    Returns
+    -------
+    index : pd.Period or pd.PeriodIndex
+        Index coerced to preferred format.
     """
     if freq is None:
         freq = _get_freq(x)
diff --git a/sktime/forecasting/base/_meta.py b/sktime/forecasting/base/_meta.py
index 104a4b0017b..e13f70f5911 100644
--- a/sktime/forecasting/base/_meta.py
+++ b/sktime/forecasting/base/_meta.py
@@ -2,6 +2,8 @@
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
 
+"""Implements meta forecaster for forecasters composed of other estimators."""
+
 __author__ = ["mloning"]
 __all__ = ["_HeterogenousEnsembleForecaster"]
 
diff --git a/sktime/forecasting/base/_sktime.py b/sktime/forecasting/base/_sktime.py
index 88762de8fd7..1a3c65a364f 100644
--- a/sktime/forecasting/base/_sktime.py
+++ b/sktime/forecasting/base/_sktime.py
@@ -1,9 +1,7 @@
 # -*- coding: utf-8 -*-
-"""
-sktime window forecaster base class
-
+# !/usr/bin/env python3 -u
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
-"""
+"""sktime window forecaster base class."""
 
 __author__ = ["@mloning", "@big-o"]
 __all__ = ["_BaseWindowForecaster"]
@@ -20,7 +18,7 @@
 
 
 class _BaseWindowForecaster(BaseForecaster):
-    """Base class for forecasters that use."""
+    """Base class for forecasters that use sliding windows."""
 
     def __init__(self, window_length=None):
         super(_BaseWindowForecaster, self).__init__()
diff --git a/sktime/forecasting/base/adapters/__init__.py b/sktime/forecasting/base/adapters/__init__.py
index df56a03cfff..179f57f98bb 100644
--- a/sktime/forecasting/base/adapters/__init__.py
+++ b/sktime/forecasting/base/adapters/__init__.py
@@ -1,5 +1,7 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements base classes for adapting other forecasters to sktime framework."""
 
 __author__ = ["Markus Löning"]
 __all__ = [
diff --git a/sktime/forecasting/base/adapters/_fbprophet.py b/sktime/forecasting/base/adapters/_fbprophet.py
index 82066d8bd1b..fb29989c3c5 100644
--- a/sktime/forecasting/base/adapters/_fbprophet.py
+++ b/sktime/forecasting/base/adapters/_fbprophet.py
@@ -1,6 +1,7 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements adapter for Facebook prophet to be used in sktime framework."""
 
 __author__ = ["Markus Löning", "Martin Walter"]
 __all__ = ["_ProphetAdapter"]
@@ -207,6 +208,7 @@ class _suppress_stdout_stderr(object):
     to stderr just before a script exits, and after the context manager has
     exited (at least, I think that is why it lets exceptions through).
 
+
     References
     ----------
     https://github.com/facebook/prophet/issues/223
diff --git a/sktime/forecasting/base/adapters/_pmdarima.py b/sktime/forecasting/base/adapters/_pmdarima.py
index acc84fb5bd1..58e0a38561c 100644
--- a/sktime/forecasting/base/adapters/_pmdarima.py
+++ b/sktime/forecasting/base/adapters/_pmdarima.py
@@ -1,6 +1,7 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements adapter for pmdarima forecasters to be used in sktime framework."""
 
 __author__ = ["Markus Löning", "Hongyi Yang"]
 __all__ = ["_PmdArimaAdapter"]
@@ -39,6 +40,7 @@ def _fit(self, y, X=None, fh=None, **fit_params):
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
             Exogenous variables are ignored
+
         Returns
         -------
         self : returns an instance of self.
diff --git a/sktime/forecasting/base/adapters/_statsmodels.py b/sktime/forecasting/base/adapters/_statsmodels.py
index c0002f24a63..19b685393f7 100644
--- a/sktime/forecasting/base/adapters/_statsmodels.py
+++ b/sktime/forecasting/base/adapters/_statsmodels.py
@@ -1,5 +1,7 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements adapter for statsmodels forecasters to be used in sktime framework."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["_StatsModelsAdapter"]
@@ -12,7 +14,7 @@
 
 
 class _StatsModelsAdapter(BaseForecaster):
-    """Base class for interfacing statsmodels forecasting algorithms"""
+    """Base class for interfacing statsmodels forecasting algorithms."""
 
     _fitted_param_names = ()
     _tags = {
@@ -37,6 +39,7 @@ def _fit(self, y, X=None, fh=None):
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
             Exogenous variables are ignored
+
         Returns
         -------
         self : returns an instance of self.
@@ -49,12 +52,11 @@ def _fit(self, y, X=None, fh=None):
         return self
 
     def _fit_forecaster(self, y_train, X_train=None):
-        """Internal fit"""
+        """Logic used internally in fit."""
         raise NotImplementedError("abstract method")
 
     def _predict(self, fh, X=None, return_pred_int=False, alpha=DEFAULT_ALPHA):
-        """
-        Make forecasts.
+        """Make forecasts.
 
         Parameters
         ----------
@@ -85,7 +87,7 @@ def _predict(self, fh, X=None, return_pred_int=False, alpha=DEFAULT_ALPHA):
         return y_pred.loc[fh.to_absolute(self.cutoff).to_pandas()]
 
     def get_fitted_params(self):
-        """Get fitted parameters
+        """Get fitted parameters.
 
         Returns
         -------
@@ -101,7 +103,7 @@ def get_fitted_params(self):
         return fitted_params
 
     def _get_fitted_param_names(self):
-        """Get names of fitted parameters"""
+        """Get names of fitted parameters."""
         return self._fitted_param_names
 
 
diff --git a/sktime/forecasting/base/adapters/_tbats.py b/sktime/forecasting/base/adapters/_tbats.py
index 80972420919..9afac80677f 100644
--- a/sktime/forecasting/base/adapters/_tbats.py
+++ b/sktime/forecasting/base/adapters/_tbats.py
@@ -1,5 +1,7 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements adapter for using tbats forecasters in sktime framework."""
 
 __author__ = ["Markus Löning", "Martin Walter"]
 __all__ = ["_TbatsAdapter"]
diff --git a/sktime/forecasting/bats.py b/sktime/forecasting/bats.py
index fae387d4a08..97efd962615 100644
--- a/sktime/forecasting/bats.py
+++ b/sktime/forecasting/bats.py
@@ -1,6 +1,12 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements BATS algorithm.
+
+BATS refers to Exponential smoothing state space model with Box-Cox
+transformation, ARMA errors, Trend and Seasonal components as described in
+De Livera, Hyndman and Snyder (2011).
+"""
 
 __author__ = ["Martin Walter"]
 __all__ = ["BATS"]
@@ -53,8 +59,8 @@ class BATS(_TbatsAdapter):
     context: abstract.ContextInterface, optional (default=None)
         For advanced users only. Provide this to override default behaviors
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.bats import BATS
     >>> y = load_airline()
diff --git a/sktime/forecasting/compose/__init__.py b/sktime/forecasting/compose/__init__.py
index e34f4e05f87..7cbb989ac2c 100644
--- a/sktime/forecasting/compose/__init__.py
+++ b/sktime/forecasting/compose/__init__.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
-"""copyright: sktime developers, BSD-3-Clause License (see LICENSE file)."""
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements composite forecasters."""
 
-__author__ = ["Markus Löning"]
+__author__ = ["mloning"]
 
 __all__ = [
     "ColumnEnsembleForecaster",
diff --git a/sktime/forecasting/compose/_ensemble.py b/sktime/forecasting/compose/_ensemble.py
index 70abc963921..5e24823fd09 100644
--- a/sktime/forecasting/compose/_ensemble.py
+++ b/sktime/forecasting/compose/_ensemble.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements forecaster for creating forecasts from ensembles of other forecasters."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["EnsembleForecaster"]
@@ -30,8 +31,8 @@ class EnsembleForecaster(_HeterogenousEnsembleForecaster):
     aggfunc : str, {'mean', 'median', 'min', 'max'}, default='mean'
         The function to aggregate prediction from individual forecasters.
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.forecasting.compose import EnsembleForecaster
     >>> from sktime.forecasting.naive import NaiveForecaster
     >>> from sktime.forecasting.trend import PolynomialTrendForecaster
diff --git a/sktime/forecasting/compose/_multiplexer.py b/sktime/forecasting/compose/_multiplexer.py
index 2b086392187..c8c5f00fa0a 100644
--- a/sktime/forecasting/compose/_multiplexer.py
+++ b/sktime/forecasting/compose/_multiplexer.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements forecaster for selecting among different model classes."""
 
 from sktime.forecasting.base._meta import _HeterogenousEnsembleForecaster
 from sktime.forecasting.base._base import DEFAULT_ALPHA
@@ -11,7 +12,7 @@
 
 
 class MultiplexForecaster(_HeterogenousEnsembleForecaster):
-    """MultiplexForecaster for model selection.
+    """MultiplexForecaster for selecting among different models.
 
     MultiplexForecaster facilitates a framework for performing
     model selection process over different model classes.
diff --git a/sktime/forecasting/compose/_pipeline.py b/sktime/forecasting/compose/_pipeline.py
index 72807ea3e94..c1f626ccce3 100644
--- a/sktime/forecasting/compose/_pipeline.py
+++ b/sktime/forecasting/compose/_pipeline.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements pipelines for forecasting."""
 
 __author__ = ["Markus Löning", "Martin Walter"]
 __all__ = ["TransformedTargetForecaster", "ForecastingPipeline"]
@@ -130,20 +131,19 @@ def set_params(self, **kwargs):
 
 
 class ForecastingPipeline(_Pipeline):
-    """Meta-estimator for forecasting with exogenous data.
+    """Pipeline for forecasting with exogenous data.
 
-    ForecastingPipeline is apply transformers to the exogenous serieses.
-    The given forecaster as last step can also be a TransformedTargetForecaster
-    containing transformers to transform y. ForecastingPipeline is only applying
-    the given transformers to X.
+    ForecastingPipeline applies the given transformers only to X.
+    The forecaster can also be a TransformedTargetForecaster containing
+    transformers to transform y.
 
     Parameters
     ----------
     steps : list
         List of tuples like ("name", forecaster/transformer)
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.datasets import load_longley
     >>> from sktime.forecasting.naive import NaiveForecaster
     >>> from sktime.forecasting.compose import ForecastingPipeline
@@ -187,6 +187,7 @@ def _fit(self, y, X=None, fh=None):
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, required
             Exogenous variables are ignored
+
         Returns
         -------
         self : returns an instance of self.
@@ -295,8 +296,8 @@ class TransformedTargetForecaster(_Pipeline, _SeriesToSeriesTransformer):
     steps : list
         List of tuples like ("name", forecaster/transformer)
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.naive import NaiveForecaster
     >>> from sktime.forecasting.compose import TransformedTargetForecaster
@@ -335,6 +336,7 @@ def _fit(self, y, X=None, fh=None):
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
             Exogenous variables are ignored
+
         Returns
         -------
         self : returns an instance of self.
@@ -417,18 +419,19 @@ def _update(self, y, X=None, update_params=True):
         return self
 
     def transform(self, Z, X=None):
-        """Transform data.
-
-        Returns a transformed version of Z.
+        """Return transformed version of input series `Z`.
 
         Parameters
         ----------
-        Z : pd.Series, pd.DataFrame
+        Z : pd.Series or pd.DataFrame
+            A time series to apply the transformation on.
+        X : pd.DataFrame, default=None
+            Exogenous data used in transformation.
 
         Returns
         -------
-        Z : pd.Series, pd.DataFrame
-            Transformed time series(es).
+        Zt : pd.Series or pd.DataFrame
+            Transformed version of input series `Z`.
         """
         self.check_is_fitted()
         zt = check_series(Z, enforce_univariate=True)
@@ -443,6 +446,8 @@ def inverse_transform(self, Z, X=None):
         ----------
         Z : pd.Series or pd.DataFrame
             A time series to reverse the transformation on.
+        X : pd.DataFrame, default=None
+            Exogenous data used in transformation.
 
         Returns
         -------
diff --git a/sktime/forecasting/compose/_reduce.py b/sktime/forecasting/compose/_reduce.py
index 168bd9f79d9..fd22ab7bb96 100644
--- a/sktime/forecasting/compose/_reduce.py
+++ b/sktime/forecasting/compose/_reduce.py
@@ -566,8 +566,7 @@ def _predict_last_window(
 
 
 class DirectTabularRegressionForecaster(_DirectReducer):
-    """
-    Direct reduction from forecasting to tabular regression.
+    """Direct reduction from forecasting to tabular regression.
 
     For the direct reduction strategy, a separate forecaster is fitted
     for each step ahead of the forecasting horizon.
@@ -585,8 +584,7 @@ class DirectTabularRegressionForecaster(_DirectReducer):
 
 
 class MultioutputTabularRegressionForecaster(_MultioutputReducer):
-    """
-    Multioutput reduction from forecasting to tabular regression.
+    """Multioutput reduction from forecasting to tabular regression.
 
     For the multioutput strategy, a single estimator capable of handling multioutput
     targets is fitted to all the future steps in the forecasting horizon.
@@ -604,8 +602,7 @@ class MultioutputTabularRegressionForecaster(_MultioutputReducer):
 
 
 class RecursiveTabularRegressionForecaster(_RecursiveReducer):
-    """
-    Recursive reduction from forecasting to tabular regression.
+    """Recursive reduction from forecasting to tabular regression.
 
     For the recursive strategy, a single estimator is fit for a one-step-ahead
     forecasting horizon and then called iteratively to predict multiple steps ahead.
@@ -623,8 +620,7 @@ class RecursiveTabularRegressionForecaster(_RecursiveReducer):
 
 
 class DirRecTabularRegressionForecaster(_DirRecReducer):
-    """
-    Dir-rec reduction from forecasting to tabular regression.
+    """Dir-rec reduction from forecasting to tabular regression.
 
     For the hybrid dir-rec strategy, a separate forecaster is fitted
     for each step ahead of the forecasting horizon and then
@@ -645,8 +641,7 @@ class DirRecTabularRegressionForecaster(_DirRecReducer):
 
 
 class DirectTimeSeriesRegressionForecaster(_DirectReducer):
-    """
-    Direct reduction from forecasting to time-series regression.
+    """Direct reduction from forecasting to time-series regression.
 
     For the direct reduction strategy, a separate forecaster is fitted
     for each step ahead of the forecasting horizon.
@@ -664,8 +659,7 @@ class DirectTimeSeriesRegressionForecaster(_DirectReducer):
 
 
 class MultioutputTimeSeriesRegressionForecaster(_MultioutputReducer):
-    """
-    Multioutput reduction from forecasting to time series regression.
+    """Multioutput reduction from forecasting to time series regression.
 
     For the multioutput strategy, a single estimator capable of handling multioutput
     targets is fitted to all the future steps in the forecasting horizon.
@@ -683,8 +677,7 @@ class MultioutputTimeSeriesRegressionForecaster(_MultioutputReducer):
 
 
 class RecursiveTimeSeriesRegressionForecaster(_RecursiveReducer):
-    """
-    Recursive reduction from forecasting to time series regression.
+    """Recursive reduction from forecasting to time series regression.
 
     For the recursive strategy, a single estimator is fit for a one-step-ahead
     forecasting horizon and then called iteratively to predict multiple steps ahead.
@@ -702,8 +695,7 @@ class RecursiveTimeSeriesRegressionForecaster(_RecursiveReducer):
 
 
 class DirRecTimeSeriesRegressionForecaster(_DirRecReducer):
-    """
-    Dir-rec reduction from forecasting to time-series regression.
+    """Dir-rec reduction from forecasting to time-series regression.
 
     For the hybrid dir-rec strategy, a separate forecaster is fitted
     for each step ahead of the forecasting horizon and then
@@ -727,8 +719,7 @@ class DirRecTimeSeriesRegressionForecaster(_DirRecReducer):
 def ReducedForecaster(
     estimator, scitype="infer", strategy="recursive", window_length=10, step_length=1
 ):
-    """
-    Reduction from forecasting to tabular or time series regression.
+    """Reduction from forecasting to tabular or time series regression.
 
     During fitting, a sliding-window approach is used to first transform the
     time series into tabular or panel data, which is then used to fit a tabular or
@@ -765,8 +756,7 @@ def ReducedForecaster(
 def ReducedRegressionForecaster(
     estimator, scitype, strategy="recursive", window_length=10, step_length=1
 ):
-    """
-    Reduction from forecasting to tabular or time series regression.
+    """Reduction from forecasting to tabular or time series regression.
 
     During fitting, a sliding-window approach is used to first transform the
     time series into tabular or panel data, which is then used to fit a tabular or
@@ -804,8 +794,7 @@ def make_reduction(
     window_length=10,
     scitype="infer",
 ):
-    """
-    Make forecaster based on reduction to tabular or time-series regression.
+    """Make forecaster based on reduction to tabular or time-series regression.
 
     During fitting, a sliding-window approach is used to first transform the
     time series into tabular or panel data, which is then used to fit a tabular or
@@ -825,6 +814,7 @@ def make_reduction(
     scitype : str, optional (default="infer")
         Must be one of "infer", "tabular-regressor" or "time-series-regressor". If
         the scitype cannot be inferred, please specify it explicitly.
+        See :term:`scitype`.
 
     Returns
     -------
diff --git a/sktime/forecasting/compose/_stack.py b/sktime/forecasting/compose/_stack.py
index c3ee95dce5b..bdd017fab13 100644
--- a/sktime/forecasting/compose/_stack.py
+++ b/sktime/forecasting/compose/_stack.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements forecasters for combining forecasts via stacking."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["StackingForecaster"]
@@ -20,7 +21,7 @@
 class StackingForecaster(_HeterogenousEnsembleForecaster):
     """StackingForecaster.
 
-    Stacks two or more Forecasters
+    Stacks two or more Forecasters.
 
     Parameters
     ----------
@@ -55,6 +56,7 @@ def _fit(self, y, X=None, fh=None):
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
             Exogenous variables are ignored
+
         Returns
         -------
         self : returns an instance of self.
@@ -86,8 +88,7 @@ def _fit(self, y, X=None, fh=None):
         return self
 
     def _update(self, y, X=None, update_params=True):
-
-        """Update fitted parameters
+        """Update fitted parameters.
 
         Parameters
         ----------
diff --git a/sktime/forecasting/croston.py b/sktime/forecasting/croston.py
index 5dc48270c84..d7042e4583a 100644
--- a/sktime/forecasting/croston.py
+++ b/sktime/forecasting/croston.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
 """Croston's Forecasting Method."""
 
+
 import numpy as np
 import pandas as pd
 from sktime.forecasting.base import BaseForecaster
@@ -8,14 +11,14 @@
 
 
 class Croston(BaseForecaster):
-    """Croston's Forecasting Method.
+    """Croston's method for forecasting intermittent demand.
 
-    This was designed for forecasting intermittent demand.
+    Implements method proposed by Croston in [1]_ and described in [2]_.
 
     Parameters
     ----------
     smoothing : float, default = 0.1
-        Smoothing parameter
+        Smoothing parameter.
 
     Examples
     --------
@@ -29,10 +32,10 @@ class Croston(BaseForecaster):
 
     References
     ----------
-    [1]  J. D. Croston. Forecasting and stock control for intermittent demands.
-        Operational Research Quarterly (1970-1977), 23(3):pp. 289–303, 1972.
-    [2]  Forecasting: Principles and Practice,
-        Otext book by Rob J Hyndman and George Athanasopoulos
+    .. [1] J. D. Croston. Forecasting and stock control for intermittent demands.
+       Operational Research Quarterly (1970-1977), 23(3):pp. 289–303, 1972.
+    .. [2] Forecasting: Principles and Practice,
+       OTexts book by Rob J Hyndman and George Athanasopoulos
     """
 
     _tags = {
@@ -55,7 +58,8 @@ def _fit(self, y, X=None, fh=None):
         fh : int, list or np.array, optional (default=None)
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
-            Exogenous variables are ignored
+            Exogenous variables are ignored.
+
         Returns
         -------
         self : returns an instance of self.
@@ -104,11 +108,12 @@ def _predict(
         fh : int, list or np.array, optional (default=None)
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
-            Exogenous variables are ignored
+            Exogenous variables are ignored.
+
         Returns
         -------
         forecast : pd.series
-                   predicted forecasts
+            Predicted forecasts.
         """
         len_fh = len(self.fh)
         f = self._f
diff --git a/sktime/forecasting/ets.py b/sktime/forecasting/ets.py
index 7eb21a15980..e02a92cc2d9 100644
--- a/sktime/forecasting/ets.py
+++ b/sktime/forecasting/ets.py
@@ -1,4 +1,8 @@
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements automatic and manual exponential time series smoothing models."""
+
 __all__ = ["AutoETS"]
 __author__ = ["Hongyi Yang"]
 
@@ -10,8 +14,7 @@
 
 
 class AutoETS(_StatsModelsAdapter):
-    """
-    ETS models with both manual and automatic fitting capabilities.
+    """ETS models with both manual and automatic fitting capabilities.
 
     Manual fitting is adapted from statsmodels' version,
     while automatic fitting is adapted from R version of ets.
@@ -146,12 +149,12 @@ class AutoETS(_StatsModelsAdapter):
 
     References
     ----------
-    [1] Hyndman, R.J., & Athanasopoulos, G. (2019) *Forecasting:
-        principles and practice*, 3rd edition, OTexts: Melbourne,
-        Australia. OTexts.com/fpp3. Accessed on April 19th 2020.
+    .. [1] Hyndman, R.J., & Athanasopoulos, G. (2019) *Forecasting:
+       principles and practice*, 3rd edition, OTexts: Melbourne,
+       Australia. OTexts.com/fpp3. Accessed on April 19th 2020.
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.ets import AutoETS
     >>> y = load_airline()
@@ -360,9 +363,9 @@ def _fit(error, trend, seasonal, damped):
             )
 
     def summary(self):
-        """
-        Get a summary of the fitted forecaster,
-        same as the implementation in statsmodels:
+        """Get a summary of the fitted forecaster.
+
+        This is the same as the implementation in statsmodels:
         https://www.statsmodels.org/dev/examples/notebooks/generated/ets.html
         """
         return self._fitted_forecaster.summary()
diff --git a/sktime/forecasting/exp_smoothing.py b/sktime/forecasting/exp_smoothing.py
index 5d0ba2c7990..247a678825e 100644
--- a/sktime/forecasting/exp_smoothing.py
+++ b/sktime/forecasting/exp_smoothing.py
@@ -1,4 +1,8 @@
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements Holt-Winters exponential smoothing."""
+
 __all__ = ["ExponentialSmoothing"]
 __author__ = ["Markus Löning", "@big-o"]
 
@@ -8,10 +12,10 @@
 
 
 class ExponentialSmoothing(_StatsModelsAdapter):
-    """
-    Holt-Winters exponential smoothing forecaster. Default settings use
-    simple exponential smoothing
-    without trend and seasonality components.
+    """Holt-Winters exponential smoothing forecaster.
+
+    Default settings use simple exponential smoothing without trend and
+    seasonality components.
 
     Parameters
     ----------
@@ -43,8 +47,8 @@ class ExponentialSmoothing(_StatsModelsAdapter):
     [1] Hyndman, Rob J., and George Athanasopoulos. Forecasting: principles
         and practice. OTexts, 2014.
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.exp_smoothing import ExponentialSmoothing
     >>> y = load_airline()
diff --git a/sktime/forecasting/fbprophet.py b/sktime/forecasting/fbprophet.py
index 0f58c3e1f38..4c1ea8d696d 100644
--- a/sktime/forecasting/fbprophet.py
+++ b/sktime/forecasting/fbprophet.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
-
-"""Prophet forecaster by wrapping fbprophet."""
+"""Implements Prophet forecaster by wrapping fbprophet."""
 
 __author__ = ["Martin Walter"]
 __all__ = ["Prophet"]
@@ -85,8 +84,8 @@ class Prophet(_ProphetAdapter):
     https://facebook.github.io/prophet
     https://github.com/facebook/prophet
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.fbprophet import Prophet
     >>> # Prophet requires to have data with a pandas.DatetimeIndex
diff --git a/sktime/forecasting/hcrystalball.py b/sktime/forecasting/hcrystalball.py
index 64d151a1d6d..5d4d131fa3b 100644
--- a/sktime/forecasting/hcrystalball.py
+++ b/sktime/forecasting/hcrystalball.py
@@ -1,4 +1,8 @@
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements wrapper for using HCrystalBall forecasters in sktime."""
+
 import pandas as pd
 from sklearn.base import clone
 
@@ -27,7 +31,7 @@ def _check_index(index):
 
 
 def _adapt_y_X(y, X):
-    """Adapt fit data to HCB compliant format
+    """Adapt fit data to HCB compliant format.
 
     Parameters
     ----------
@@ -54,7 +58,7 @@ def _adapt_y_X(y, X):
 
 
 def _get_X_pred(X_pred, index):
-    """Translate forecast horizon interface to HCB native dataframe
+    """Translate forecast horizon interface to HCB native dataframe.
 
     Parameters
     ----------
@@ -77,9 +81,9 @@ def _get_X_pred(X_pred, index):
 
 
 def _adapt_y_pred(y_pred):
-    """Translate wrapper prediction to sktime format
+    """Translate wrapper prediction to sktime format.
 
-    From Dataframe to series
+    From Dataframe to series.
 
     Parameters
     ----------
@@ -94,6 +98,13 @@ def _adapt_y_pred(y_pred):
 
 
 class HCrystalBallForecaster(BaseForecaster):
+    """Implement wrapper to allow use of HCrystalBall forecasters in sktime.
+
+    Parameters
+    ----------
+    model :
+        The HCrystalBall forecasting model to use.
+    """
 
     _tags = {
         "univariate-only": True,
@@ -121,7 +132,6 @@ def _fit(self, y, X=None, fh=None):
         -------
         self : returns an instance of self.
         """
-
         y, X = _adapt_y_X(y, X)
         self.model_ = clone(self.model)
         self.model_.fit(X, y)
@@ -129,7 +139,7 @@ def _fit(self, y, X=None, fh=None):
         return self
 
     def _predict(self, fh=None, X=None, return_pred_int=False, alpha=DEFAULT_ALPHA):
-        """Make forecasts for the given forecast horizon
+        """Make forecasts for the given forecast horizon.
 
         Parameters
         ----------
@@ -156,6 +166,7 @@ def _predict(self, fh=None, X=None, return_pred_int=False, alpha=DEFAULT_ALPHA):
         return _adapt_y_pred(y_pred)
 
     def get_fitted_params(self):
+        """Get fitted parameters."""
         raise NotImplementedError()
 
     def _compute_pred_err(self, alphas):
diff --git a/sktime/forecasting/model_evaluation/__init__.py b/sktime/forecasting/model_evaluation/__init__.py
index 565e5135589..68a87478e4f 100644
--- a/sktime/forecasting/model_evaluation/__init__.py
+++ b/sktime/forecasting/model_evaluation/__init__.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements functionality to evaluate forecasting models."""
 
 __author__ = ["Martin Walter"]
 __all__ = ["evaluate"]
diff --git a/sktime/forecasting/model_evaluation/_functions.py b/sktime/forecasting/model_evaluation/_functions.py
index eb9d9a0281c..53573d342ee 100644
--- a/sktime/forecasting/model_evaluation/_functions.py
+++ b/sktime/forecasting/model_evaluation/_functions.py
@@ -1,4 +1,7 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements functions to be used in evaluating forecasting models."""
 
 __author__ = ["Martin Walter", "Markus Löning"]
 __all__ = ["evaluate"]
@@ -49,13 +52,13 @@ def evaluate(
         y_pred, y_test.
 
     Returns
-    ----------
+    -------
     pd.DataFrame
         DataFrame that contains several columns with information regarding each
         refit/update and prediction of the forecaster.
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.model_evaluation import evaluate
     >>> from sktime.forecasting.model_selection import ExpandingWindowSplitter
diff --git a/sktime/forecasting/model_selection/__init__.py b/sktime/forecasting/model_selection/__init__.py
index 71f191ce74c..a6c9fd9ed42 100644
--- a/sktime/forecasting/model_selection/__init__.py
+++ b/sktime/forecasting/model_selection/__init__.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements functionality for selecting forecasting models."""
 
 __author__ = ["Markus Löning", "Kutay Koralturk"]
 __all__ = [
diff --git a/sktime/forecasting/model_selection/_split.py b/sktime/forecasting/model_selection/_split.py
index 64e983e1f0a..5ecc900551e 100644
--- a/sktime/forecasting/model_selection/_split.py
+++ b/sktime/forecasting/model_selection/_split.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implement dataset splitting for model evaluation and selection."""
 
 __all__ = [
     "ExpandingWindowSplitter",
@@ -34,7 +35,7 @@
 
 
 def _repr(self):
-    """Helper function to build repr for splitters similar to estimator objects"""
+    """Build repr for splitters similar to estimator objects."""
     # This is copied from scikit-learn's BaseEstimator get_params method
     cls = self.__class__
     init = getattr(cls.__init__, "deprecated_original", cls.__init__)
@@ -96,20 +97,19 @@ def has_changed(k, v):
 
 
 def _check_y(y):
-    """Check input to `split` function"""
+    """Check input to `split` function."""
     if isinstance(y, pd.Series):
         y = y.index
     return check_time_index(y)
 
 
 def _check_fh(fh):
-    """Check and convert fh to format expected by CV splitters"""
+    """Check and convert fh to format expected by CV splitters."""
     return check_fh(fh, enforce_relative=True)
 
 
 def _get_end(y, fh):
-    """Compute the end of the last training window for a given and forecasting
-    horizon."""
+    """Compute the end of the last training window for a forecasting horizon."""
     # `fh` is assumed to be ordered and checked by `_check_fh` and `window_length` by
     # `check_window_length`.
     n_timepoints = y.shape[0]
@@ -182,7 +182,7 @@ def split(self, y):
             yield train[train >= 0], test[test >= 0]
 
     def _split(self, y):
-        """Internal split method implemented by concrete classes"""
+        """Split method containing internal logic implemented by concrete classes."""
         raise NotImplementedError("abstract method")
 
     def get_n_splits(self, y=None):
@@ -216,7 +216,7 @@ def get_cutoffs(self, y=None):
         raise NotImplementedError("abstract method")
 
     def get_fh(self):
-        """Return the forecasting horizon
+        """Return the forecasting horizon.
 
         Returns
         -------
@@ -266,16 +266,16 @@ def _split(self, y):
             yield training_window, test_window
 
     def get_n_splits(self, y=None):
-        """Return the number of splits"""
+        """Return the number of splits."""
         return len(self.cutoffs)
 
     def get_cutoffs(self, y=None):
-        """Return the cutoff points"""
+        """Return the cutoff points."""
         return check_cutoffs(self.cutoffs)
 
 
 class BaseWindowSplitter(BaseSplitter):
-    """Base class for sliding and expanding window splitter"""
+    """Base class for sliding and expanding window splitter."""
 
     def __init__(
         self,
@@ -333,12 +333,11 @@ def _split(self, y):
 
     @staticmethod
     def _split_windows(start, end, step_length, window_length, fh):
-        """Abstract method implemented by concrete classes for sliding and expanding
-        windows"""
+        """Abstract method for sliding/expanding windows."""
         raise NotImplementedError("abstract method")
 
     def _get_start(self, fh):
-        """Get the first split point"""
+        """Get the first split point."""
         # By default, the first split point is the index zero, the first
         # observation in
         # the data.
@@ -369,7 +368,7 @@ def _get_start(self, fh):
         return start
 
     def get_n_splits(self, y=None):
-        """Return number of splits
+        """Return number of splits.
 
         Parameters
         ----------
@@ -467,7 +466,7 @@ def __init__(
 
     @staticmethod
     def _split_windows(start, end, step_length, window_length, fh):
-        """Sliding windows"""
+        """Generate sliding windows."""
         for split_point in range(start, end, step_length):
             train = np.arange(split_point - window_length, split_point)
             test = split_point + fh - 1
@@ -526,7 +525,7 @@ def __init__(
 
     @staticmethod
     def _split_windows(start, end, step_length, window_length, fh):
-        """Expanding windows"""
+        """Generate expanding windows."""
         for split_point in range(start, end, step_length):
             train = np.arange(start - window_length, split_point)
             test = split_point + fh - 1
@@ -594,7 +593,8 @@ def get_cutoffs(self, y=None):
 
 
 def temporal_train_test_split(y, X=None, test_size=None, train_size=None, fh=None):
-    """Split arrays or matrices into sequential train and test subsets
+    """Split arrays or matrices into sequential train and test subsets.
+
     Creates train/test splits over endogenous arrays an optional exogenous
     arrays.
 
@@ -648,8 +648,10 @@ def temporal_train_test_split(y, X=None, test_size=None, train_size=None, fh=Non
 
 
 def _split_by_fh(y, fh, X=None):
-    """Helper function to split time series with forecasting horizon handling both
-    relative and absolute horizons"""
+    """Split time series with forecasting horizon.
+
+    Handles both relative and absolute horizons.
+    """
     if X is not None:
         check_equal_time_index(y, X)
     fh = check_fh(fh)
diff --git a/sktime/forecasting/model_selection/_tune.py b/sktime/forecasting/model_selection/_tune.py
index d8102be22b4..6df57153575 100644
--- a/sktime/forecasting/model_selection/_tune.py
+++ b/sktime/forecasting/model_selection/_tune.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements grid search functionality to tune forecasters."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["ForecastingGridSearchCV", "ForecastingRandomizedSearchCV"]
@@ -221,6 +222,7 @@ def _fit(self, y, X=None, fh=None, **fit_params):
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
             Exogenous variables are ignored
+
         Returns
         -------
         self : returns an instance of self.
diff --git a/sktime/forecasting/naive.py b/sktime/forecasting/naive.py
index 75c747f15e8..1251423f01e 100644
--- a/sktime/forecasting/naive.py
+++ b/sktime/forecasting/naive.py
@@ -1,6 +1,7 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements simple forecasts based on naive assumptions."""
 
 __all__ = ["NaiveForecaster"]
 __author__ = ["Markus Löning", "Piyush Gade"]
@@ -16,7 +17,8 @@
 
 
 class NaiveForecaster(_BaseWindowForecaster):
-    """
+    """Forecast based on naive assumptions about past trends continuing.
+
     NaiveForecaster is a forecaster that makes forecasts using simple
     strategies.
 
@@ -47,8 +49,8 @@ class NaiveForecaster(_BaseWindowForecaster):
         Window length to use in the `mean` strategy. If None, entire training
             series will be used.
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.naive import NaiveForecaster
     >>> y = load_airline()
@@ -155,7 +157,7 @@ def _fit(self, y, X=None, fh=None):
     def _predict_last_window(
         self, fh, X=None, return_pred_int=False, alpha=DEFAULT_ALPHA
     ):
-        """Internal predict"""
+        """Calculate predictions for use in predict."""
         last_window, _ = self._get_last_window()
         fh = fh.to_relative(self.cutoff)
 
diff --git a/sktime/forecasting/online_learning/__init__.py b/sktime/forecasting/online_learning/__init__.py
index 38a63f302e8..79f8e03f9a9 100644
--- a/sktime/forecasting/online_learning/__init__.py
+++ b/sktime/forecasting/online_learning/__init__.py
@@ -1,6 +1,7 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements algorithms for creating online ensembles of forecasters."""
 
 __author__ = ["William Zheng"]
 
diff --git a/sktime/forecasting/online_learning/_online_ensemble.py b/sktime/forecasting/online_learning/_online_ensemble.py
index 31ab2526bea..e28b95a7380 100644
--- a/sktime/forecasting/online_learning/_online_ensemble.py
+++ b/sktime/forecasting/online_learning/_online_ensemble.py
@@ -1,4 +1,8 @@
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements framework for applying online ensembling algorithms to forecasters."""
+
 import numpy as np
 import pandas as pd
 
@@ -10,7 +14,7 @@
 
 
 class OnlineEnsembleForecaster(EnsembleForecaster):
-    """Online Updating Ensemble of forecasters
+    """Online Updating Ensemble of forecasters.
 
     Parameters
     ----------
@@ -47,19 +51,22 @@ def _fit(self, y, X=None, fh=None):
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
             Exogenous variables are ignored
+
         Returns
         -------
         self : returns an instance of self.
         """
-
         names, forecasters = self._check_forecasters()
         self.weights = np.ones(len(forecasters)) / len(forecasters)
         self._fit_forecasters(forecasters, y, X, fh)
         return self
 
     def _fit_ensemble(self, y, X=None):
-        """Fits the ensemble by allowing forecasters to predict and
-           compares to the actual parameters.
+        """Fit the ensemble.
+
+        This makes predictions with individual forecasters and compares the
+        results to actual values. This is then used to update ensemble
+        weights.
 
         Parameters
         ----------
@@ -87,7 +94,6 @@ def _update(self, y, X=None, update_params=False):
         -------
         self : an instance of self
         """
-
         if len(y) >= 1 and self.ensemble_algorithm is not None:
             self._fit_ensemble(y, X)
 
diff --git a/sktime/forecasting/online_learning/_prediction_weighted_ensembler.py b/sktime/forecasting/online_learning/_prediction_weighted_ensembler.py
index 0cbd4b99884..1c2d5eaaf11 100644
--- a/sktime/forecasting/online_learning/_prediction_weighted_ensembler.py
+++ b/sktime/forecasting/online_learning/_prediction_weighted_ensembler.py
@@ -1,13 +1,18 @@
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements online algorithms for prediction weighted ensembles."""
+
 import numpy as np
 from scipy.optimize import bisect
 from scipy.optimize import nnls
 
 
 class _PredictionWeightedEnsembler:
-    """Wrapper class to handle ensemble algorithms that use multiple forecasters
-    for prediction. We implement default methods for setting uniform weights,
-    updating and prediction.
+    """Wrapper class to handle ensemble algorithms that use multiple forecasters.
+
+    This implements default methods for setting uniform weights, updating
+    and prediction.
 
     Parameters
     ----------
@@ -30,8 +35,7 @@ def __init__(self, n_estimators=10, loss_func=None):
         super(_PredictionWeightedEnsembler, self).__init__()
 
     def _predict(self, y_pred):
-        """Performs prediction by taking a weighted average of the estimator
-            predictions w.r.t the weights vector
+        """Make predictions by taking weighted average of forecaster predictions.
 
         Parameters
         ----------
@@ -48,8 +52,7 @@ def _predict(self, y_pred):
         return prediction
 
     def _modify_weights(self, new_array):
-        """Performs a pointwise multiplication of the current
-        weights with a new array of weights.
+        """Multiply pointwise the current weights with a new array of weights.
 
         Parameters
         ----------
@@ -60,8 +63,10 @@ def _modify_weights(self, new_array):
         self.weights /= np.sum(self.weights)
 
     def _update(self, y_pred, y_true):
-        """Resets the weights over the estimators by passing previous observations
-            to the weighting algorithm
+        """Update fitted parameters and perform a new ensemble fit.
+
+        Resets the weights over the estimators by passing previous
+        observations to the weighting algorithm.
 
         Parameters
         ----------
@@ -73,7 +78,7 @@ def _update(self, y_pred, y_true):
         raise NotImplementedError()
 
     def _uniform_weights(self, n_estimators):
-        """Resets weights for n estimator to uniform weights
+        """Reset weights for n estimator to uniform weights.
 
         Parameters
         ----------
@@ -85,8 +90,10 @@ def _uniform_weights(self, n_estimators):
 
 
 class HedgeExpertEnsemble(_PredictionWeightedEnsembler):
-    """Wrapper class to set parameters for hedge-style ensemble algorithms with
-    a forecasting horizon and normalizing constant.
+    """Use hedge-style ensemble algorithms.
+
+    Wrapper for hedge-style ensemble algorithms with a forecasting horizon and
+    normalizing constant.
 
     Parameters
     ----------
@@ -115,7 +122,9 @@ def __init__(self, n_estimators=10, T=10, a=1, loss_func=None):
 
 
 class NormalHedgeEnsemble(HedgeExpertEnsemble):
-    """Implementation of A Parameter-free Hedging Algorithm,
+    """Parameter-free hedging algorithm.
+
+    Implementation of A Parameter-free Hedging Algorithm,
     Kamalika Chaudhuri, Yoav Freund, Daniel Hsu (2009) as a hedge-style
     algorithm.
 
@@ -142,8 +151,10 @@ def __init__(self, n_estimators=10, a=1, loss_func=None):
         self.R = np.zeros(n_estimators)
 
     def update(self, y_pred, y_true, low_c=0.01):
-        """Resets the weights over the estimators by passing previous observations
-            and updating based on Normal Hedge.
+        """Update forecaster weights.
+
+        The weights are updated over the estimators by passing previous
+        observations and updating based on Normal Hedge.
 
         Parameters
         ----------
@@ -170,7 +181,9 @@ def update(self, y_pred, y_true, low_c=0.01):
             self._update_weights(low_c=low_c)
 
     def _update_weights(self, low_c=0.01):
-        """Updates the weights on each of the estimators by performing a potential
+        """Update forecaster weights.
+
+        Update the weights on each of the estimators by performing a potential
         function update with a root-finding search. low_c represents the lower
         bound on the window that the root finding is occuring over.
 
@@ -179,7 +192,6 @@ def _update_weights(self, low_c=0.01):
         low_c : float
             lowest value that c can take
         """
-
         # Calculating Normalizing Constant
         R_plus = np.array(list(map(lambda x: 0 if 0 > x else x, self.R)))
         normalizing_R = np.max(R_plus)
@@ -190,7 +202,7 @@ def _update_weights(self, low_c=0.01):
         high_c = (max(R_plus) ** 2) / 2
 
         def _pot(c):
-            """Internal Potential Function
+            """Calculate algorithm's potential function.
 
             Parameters
             ----------
@@ -206,7 +218,7 @@ def _pot(c):
         c_t = bisect(_pot, low_c, high_c)
 
         def _prob(r, c_t):
-            """Internal Probability Function
+            """Calculate algorithm's probability function.
 
             Parameters
             ----------
@@ -227,7 +239,9 @@ def _prob(r, c_t):
 
 
 class NNLSEnsemble(_PredictionWeightedEnsembler):
-    """Ensemble class that performs a non-negative least squares to fit to the
+    """Ensemble forecasts with Non-negative least squares based weighting.
+
+    Ensemble class that performs a non-negative least squares to fit to the
     estimators. Keeps track of all observations seen so far and fits to it.
 
     Parameters
@@ -250,6 +264,15 @@ def __init__(self, n_estimators=10, loss_func=None):
         self.total_y_true = np.empty(0)
 
     def update(self, y_pred, y_true):
+        """Update the online ensemble with new data.
+
+        Parameters
+        ----------
+        y_pred : np.array(), shape=(time_axis,estimator_axis)
+            array with predictions from the estimators
+        y_true : np.array(), shape=(time_axis)
+            array with actual values for predicted quantity
+        """
         self.total_y_pred = np.concatenate((self.total_y_pred, y_pred), axis=1)
         self.total_y_true = np.concatenate((self.total_y_true, y_true))
         weights, loss = nnls(self.total_y_pred.T, self.total_y_true)
diff --git a/sktime/forecasting/tbats.py b/sktime/forecasting/tbats.py
index 041b74be849..298d083e543 100644
--- a/sktime/forecasting/tbats.py
+++ b/sktime/forecasting/tbats.py
@@ -1,6 +1,12 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# !/usr/bin/env python3 -u
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements TBATS algorithm.
+
+TBATS refers to Exponential smoothing state space model with Box-Cox
+transformation, ARMA errors, Trigonometric Trend and Seasonal components as
+described in De Livera, Hyndman and Snyder (2011).
+"""
 
 __author__ = ["Martin Walter"]
 __all__ = ["TBATS"]
@@ -53,8 +59,8 @@ class TBATS(_TbatsAdapter):
     context: abstract.ContextInterface, optional (default=None)
         For advanced users only. Provide this to override default behaviors
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.tbats import TBATS
     >>> y = load_airline()
diff --git a/sktime/forecasting/theta.py b/sktime/forecasting/theta.py
index 19a36544d8c..ef1eec9103d 100644
--- a/sktime/forecasting/theta.py
+++ b/sktime/forecasting/theta.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
-"""Theta forecaster from statsmodels."""
+
+"""Theta forecaster."""
+
 __all__ = ["ThetaForecaster"]
 __author__ = ["big-o", "mloning"]
 
@@ -20,21 +22,16 @@ class ThetaForecaster(ExponentialSmoothing):
     """Theta method for forecasting.
 
     The theta method as defined in [1]_ is equivalent to simple exponential
-    smoothing
-    (SES) with drift. This is demonstrated in [2]_.
+    smoothing (SES) with drift (as demonstrated in [2]_).
 
     The series is tested for seasonality using the test outlined in A&N. If
-    deemed
-    seasonal, the series is seasonally adjusted using a classical
-    multiplicative
-    decomposition before applying the theta method. The resulting forecasts
-    are then
-    reseasonalised.
+    deemed seasonal, the series is seasonally adjusted using a classical
+    multiplicative decomposition before applying the theta method. The
+    resulting forecasts are then reseasonalised.
 
     In cases where SES results in a constant forecast, the theta forecaster
-    will revert
-    to predicting the SES constant plus a linear trend derived from the
-    training data.
+    will revert to predicting the SES constant plus a linear trend derived
+    from the training data.
 
     Prediction intervals are computed using the underlying state space model.
 
@@ -66,21 +63,17 @@ class ThetaForecaster(ExponentialSmoothing):
 
     References
     ----------
-    .. [1] `Assimakopoulos, V. and Nikolopoulos, K. The theta model: a
-    decomposition
-           approach to forecasting. International Journal of Forecasting 16,
-           521-530,
-           2000.
-           <https://www.sciencedirect.com/science/article/pii
-           /S0169207000000662>`_
+    .. [1] Assimakopoulos, V. and Nikolopoulos, K. The theta model: a
+       decomposition approach to forecasting. International Journal of
+       Forecasting 16, 521-530, 2000.
+       https://www.sciencedirect.com/science/article/pii/S0169207000000662
 
-    .. [2] `Hyndman, Rob J., and Billah, Baki. Unmasking the Theta method.
+    .. [2] Hyndman, Rob J., and Billah, Baki. Unmasking the Theta method.
-           International J. Forecasting, 19, 287-290, 2003.
-           <https://www.sciencedirect.com/science/article/pii
-           /S0169207001001431>`_
+       International J. Forecasting, 19, 287-290, 2003.
+       https://www.sciencedirect.com/science/article/pii/S0169207001001431
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.theta import ThetaForecaster
     >>> y = load_airline()
@@ -120,6 +113,7 @@ def _fit(self, y, X=None, fh=None):
             The forecasters horizon with the steps ahead to to predict.
         X : pd.DataFrame, optional (default=None)
             Exogenous variables are ignored
+
         Returns
         -------
         self : returns an instance of self.
@@ -227,8 +221,7 @@ def _update(self, y, X=None, update_params=True):
 
 
 def _zscore(level: float, two_tailed: bool = True) -> float:
-    """
-    Calculate a z-score from a confidence level.
+    """Calculate a z-score from a confidence level.
 
     Parameters
     ----------
diff --git a/sktime/forecasting/trend.py b/sktime/forecasting/trend.py
index 2276e0b73db..10c2b3fa168 100644
--- a/sktime/forecasting/trend.py
+++ b/sktime/forecasting/trend.py
@@ -1,6 +1,7 @@
-#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
-"""copyright: sktime developers, BSD-3-Clause License (see LICENSE file)."""
+# !/usr/bin/env python3 -u
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements trend based forecaster."""
 
 __author__ = ["Anthony Jancso", "mloning"]
 __all__ = ["TrendForecaster", "PolynomialTrendForecaster"]
@@ -18,7 +19,7 @@
 
 
 class TrendForecaster(BaseForecaster):
-    """Forecast time series data.
+    """Trend based forecasts of time series data.
 
     Default settings train a linear regression model.
 
@@ -127,8 +128,8 @@ class PolynomialTrendForecaster(BaseForecaster):
         zero. (i.e. a column of ones, acts as an intercept term in a linear
         model)
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.trend import PolynomialTrendForecaster
     >>> y = load_airline()
diff --git a/sktime/performance_metrics/base/_base.py b/sktime/performance_metrics/base/_base.py
index 9b14afa13f7..3dd51c02253 100644
--- a/sktime/performance_metrics/base/_base.py
+++ b/sktime/performance_metrics/base/_base.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements base class for defining performance metrics in sktime."""
 
 __author__ = ["Ryan Kuhns"]
 __all__ = ["BaseMetric"]
diff --git a/sktime/performance_metrics/forecasting/_classes.py b/sktime/performance_metrics/forecasting/_classes.py
index aae102b99bb..4e084fba85c 100644
--- a/sktime/performance_metrics/forecasting/_classes.py
+++ b/sktime/performance_metrics/forecasting/_classes.py
@@ -1,4 +1,13 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Metrics classes to assess performance on forecasting task.
+
+Classes named as ``*Score`` return a value to maximize: the higher the better.
+Classes named as ``*Error`` or ``*Loss`` return a value to minimize:
+the lower the better.
+"""
+
 from sktime.performance_metrics.base import BaseMetric
 from sktime.performance_metrics.forecasting._functions import (
     relative_loss,
diff --git a/sktime/performance_metrics/forecasting/_functions.py b/sktime/performance_metrics/forecasting/_functions.py
index 316587c0c94..e3cdd3f6b40 100644
--- a/sktime/performance_metrics/forecasting/_functions.py
+++ b/sktime/performance_metrics/forecasting/_functions.py
@@ -1,15 +1,13 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
-"""Metrics to assess performance on forecasting task.
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Metrics functions to assess performance on forecasting task.
 
-Functions named as ``*_score`` return a scalar value to maximize: the higher
-the better.
-Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
+Functions named as ``*_score`` return a value to maximize: the higher the better.
+Functions named as ``*_error`` or ``*_loss`` return a value to minimize:
 the lower the better.
 """
 
-# !/usr/bin/env python3 -u
-# -*- coding: utf-8 -*-
-# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
 import numpy as np
 from scipy.stats import gmean
 from sklearn.utils.stats import _weighted_percentile
diff --git a/sktime/registry/__init__.py b/sktime/registry/__init__.py
index 37d160e6708..89ea2344494 100644
--- a/sktime/registry/__init__.py
+++ b/sktime/registry/__init__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Sktime registry module exports."""
+"""Implements registry for sktime estimator base classes and tags."""
 
 from sktime.registry._tags import (
     ESTIMATOR_TAG_REGISTER,
diff --git a/sktime/registry/_base_classes.py b/sktime/registry/_base_classes.py
index dd81ba5ecea..21a52a4cea4 100644
--- a/sktime/registry/_base_classes.py
+++ b/sktime/registry/_base_classes.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
-"""
-Register of estimator base classes corresponding to sktime scitypes.
+"""Register of estimator base classes corresponding to sktime scitypes.
 
 This module exports the following:
 
diff --git a/sktime/registry/_tags.py b/sktime/registry/_tags.py
index 87261bdfe5f..ef829518fe2 100644
--- a/sktime/registry/_tags.py
+++ b/sktime/registry/_tags.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
-"""
-Register of estimator and object tags.
+"""Register of estimator and object tags.
 
 Note for extenders: new tags should be entered in ESTIMATOR_TAG_REGISTER.
 No other place is necessary to add new tags.
diff --git a/sktime/regression/__init__.py b/sktime/regression/__init__.py
index 5fc57ca4930..0b57ec8e6a6 100644
--- a/sktime/regression/__init__.py
+++ b/sktime/regression/__init__.py
@@ -1,4 +1,2 @@
 # -*- coding: utf-8 -*-
-__all__ = ["ComposableTimeSeriesForestRegressor"]
-
-from sktime.regression.compose._ensemble import ComposableTimeSeriesForestRegressor
+"""Implements sktime estimators for time series regression."""
diff --git a/sktime/regression/all/__init__.py b/sktime/regression/all/__init__.py
index aae8534583e..046c7ee435f 100644
--- a/sktime/regression/all/__init__.py
+++ b/sktime/regression/all/__init__.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+"""Import all time series regression functionality available in sktime."""
 
 __author__ = ["Markus Löning"]
 __all__ = [
diff --git a/sktime/regression/base.py b/sktime/regression/base.py
index e0588c97ece..850a1fba1f6 100644
--- a/sktime/regression/base.py
+++ b/sktime/regression/base.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements base class for time series regression estimators in sktime."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["BaseRegressor"]
@@ -9,17 +10,55 @@
 
 
 class BaseRegressor(BaseEstimator):
-    """
-    Base class for regressors, for identification.
-    """
+    """Base class for regressors, for identification."""
 
     def fit(self, X, y):
+        """Fit regressor to training data.
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            Exogenous data.
+        y : pd.Series, pd.DataFrame, or np.array
+            Target time series to which to fit the regressor.
+
+        Returns
+        -------
+        self :
+            Reference to self.
+        """
         raise NotImplementedError("abstract method")
 
     def predict(self, X):
+        """Predict time series.
+
+        Parameters
+        ----------
+        X : pd.DataFrame, shape=[n_obs, n_vars]
+            A 2-d dataframe of exogenous variables.
+
+        Returns
+        -------
+        y_pred : pd.Series
+            Regression predictions.
+        """
         raise NotImplementedError("abstract method")
 
     def score(self, X, y):
+        """Scores regression against ground truth, R-squared.
+
+        Parameters
+        ----------
+        X : pd.DataFrame, shape=[n_obs, n_vars]
+            A 2-d dataframe of exogenous variables.
+        y : pd.Series
+            Target time series to which to compare the predictions.
+
+        Returns
+        -------
+        score : float
+            R-squared score.
+        """
         from sklearn.metrics import r2_score
 
         return r2_score(y, self.predict(X))
diff --git a/sktime/regression/compose/__init__.py b/sktime/regression/compose/__init__.py
index 5fc57ca4930..e495edf384c 100644
--- a/sktime/regression/compose/__init__.py
+++ b/sktime/regression/compose/__init__.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+"""Implement composite time series regression estimators."""
+
 __all__ = ["ComposableTimeSeriesForestRegressor"]
 
 from sktime.regression.compose._ensemble import ComposableTimeSeriesForestRegressor
diff --git a/sktime/regression/compose/_ensemble.py b/sktime/regression/compose/_ensemble.py
index 09e491f6ee9..bb1243e4134 100644
--- a/sktime/regression/compose/_ensemble.py
+++ b/sktime/regression/compose/_ensemble.py
@@ -1,4 +1,8 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements a composite Time series Forest Regressor that accepts a pipeline."""
+
 __author__ = ["Markus Löning", "Ayushmaan Seth"]
 __all__ = ["ComposableTimeSeriesForestRegressor"]
 
@@ -285,14 +289,17 @@ def _validate_estimator(self):
 
     def predict(self, X):
         """Predict regression target for X.
+
         The predicted regression target of an input sample is computed as the
         mean predicted regression targets of the trees in the forest.
+
         Parameters
         ----------
         X : array-like or sparse matrix of shape = [n_samples, n_features]
             The input samples. Internally, its dtype will be converted to
             ``dtype=np.float32``. If a sparse matrix is provided, it will be
             converted into a sparse ``csr_matrix``.
+
         Returns
         -------
         y : array of shape = [n_samples] or [n_samples, n_outputs]
@@ -314,8 +321,7 @@ def predict(self, X):
         return np.sum(y_hat, axis=0) / len(self.estimators_)
 
     def _set_oob_score(self, X, y):
-        """
-        Compute out-of-bag scores."""
+        """Compute out-of-bag scores."""
         X, y = check_X_y(X, y, enforce_univariate=True)
 
         n_samples = y.shape[0]
diff --git a/sktime/regression/interval_based/__init__.py b/sktime/regression/interval_based/__init__.py
index fed0e429f51..5c4643097d8 100644
--- a/sktime/regression/interval_based/__init__.py
+++ b/sktime/regression/interval_based/__init__.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+"""Implement interval based time series regression estimators."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["TimeSeriesForestRegressor"]
diff --git a/sktime/regression/interval_based/_tsf.py b/sktime/regression/interval_based/_tsf.py
index 677f9681fb9..1ee860a823e 100644
--- a/sktime/regression/interval_based/_tsf.py
+++ b/sktime/regression/interval_based/_tsf.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
-"""
-    Time Series Forest Regressor (TSF).
-"""
+"""Time Series Forest Regressor (TSF)."""
 
 __author__ = ["Tony Bagnall", "kkoziara", "luiszugasti", "kanand77", "Markus Löning"]
 __all__ = ["TimeSeriesForestRegressor"]
@@ -36,10 +34,10 @@ class TimeSeriesForestRegressor(BaseTimeSeriesForest, ForestRegressor, BaseRegre
      intervals with replacement and does not use the splitting criteria tiny
      refinement described in [1]. This is an intentionally stripped down, non
      configurable version for use as a hive-cote component. For a configurable
-     tree based ensemble, see sktime.classifiers.ensemble.TimeSeriesForestClassifier
+     tree based ensemble, see sktime.classifiers.ensemble.TimeSeriesForestClassifier.
 
-     Parameters
-     ----------
+    Parameters
+    ----------
      n_estimators    : int, ensemble size, optional (default = 200)
      min_interval    : int, minimum width of an interval, optional (default
      to 3)
@@ -48,14 +46,14 @@ class TimeSeriesForestRegressor(BaseTimeSeriesForest, ForestRegressor, BaseRegre
          ``-1`` means using all processors.
      random_state    : int, seed for random, optional (default = none)
 
-     Attributes
-     ----------
+    Attributes
+    ----------
      n_classes    : int
      n_intervals  : int
      classes_    : List of classes for a given problem
 
-     References
-     ----------
+    References
+    ----------
      .. [1] H.Deng, G.Runger, E.Tuv and M.Vladimir, "A time series forest for
      classification and feature extraction",Information Sciences, 239, 2013
      Java implementation
@@ -67,7 +65,7 @@ class TimeSeriesForestRegressor(BaseTimeSeriesForest, ForestRegressor, BaseRegre
     _base_estimator = DecisionTreeRegressor()
 
     def predict(self, X):
-        """Predict
+        """Predict.
 
         Parameters
         ----------
diff --git a/sktime/series_as_features/base/estimators/_ensemble.py b/sktime/series_as_features/base/estimators/_ensemble.py
index b2984e4de88..aabf6983489 100644
--- a/sktime/series_as_features/base/estimators/_ensemble.py
+++ b/sktime/series_as_features/base/estimators/_ensemble.py
@@ -1,4 +1,8 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements base class for time series forests."""
+
 __author__ = ["Markus Löning", "Ayushmaan Seth"]
 __all__ = ["BaseTimeSeriesForest"]
 
@@ -42,8 +46,7 @@ def _parallel_build_trees(
     class_weight=None,
     n_samples_bootstrap=None,
 ):
-    """
-    Private function used to fit a single tree in parallel."""
+    """Private function used to fit a single tree in parallel."""
     if verbose > 1:
         print("building tree %d of %d" % (tree_idx + 1, n_trees))  # noqa: T001
 
@@ -80,9 +83,7 @@ def _parallel_build_trees(
 
 
 class BaseTimeSeriesForest(BaseForest):
-    """
-    Base class for forests of trees.
-    """
+    """Base class for forests of trees."""
 
     @abstractmethod
     def __init__(
@@ -113,6 +114,7 @@ def __init__(
 
     def _make_estimator(self, append=True, random_state=None):
         """Make and configure a copy of the `estimator_` attribute.
+
         Warning: This method should be used to properly instantiate new
         sub-estimators.
         """
@@ -128,8 +130,8 @@ def _make_estimator(self, append=True, random_state=None):
         return estimator
 
     def fit(self, X, y, sample_weight=None):
-        """
-        Build a forest of trees from the training set (X, y).
+        """Build a forest of trees from the training set (X, y).
+
         Parameters
         ----------
         X : array-like or sparse matrix of shape (n_samples, n_features)
@@ -145,6 +147,7 @@ def fit(self, X, y, sample_weight=None):
             ignored while searching for a split in each node. In the case of
             classification, splits are also ignored if they would result in any
             single class carrying a negative weight in either child node.
+
         Returns
         -------
         self : object
@@ -270,9 +273,14 @@ def fit(self, X, y, sample_weight=None):
         return self
 
     def apply(self, X):
+        """Abstract method that is implemented by concrete estimators."""
         raise NotImplementedError()
 
     def decision_path(self, X):
+        """Decision path of decision tree.
+
+        Abstract method that is implemented by concrete estimators.
+        """
         raise NotImplementedError()
 
     def _validate_X_predict(self, X):
@@ -288,7 +296,7 @@ def _validate_X_predict(self, X):
 
     @property
     def feature_importances_(self):
-        """Compute feature importances for time series forest"""
+        """Compute feature importances for time series forest."""
         # assumes particular structure of clf,
         # with each tree consisting of a particular pipeline,
         # as in modular tsf
diff --git a/sktime/transformations/series/__init__.py b/sktime/transformations/series/__init__.py
index e69de29bb2d..b35aa918a13 100644
--- a/sktime/transformations/series/__init__.py
+++ b/sktime/transformations/series/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+"""Module :mod:`sktime.transformations.series` implements series transformations."""
diff --git a/sktime/transformations/series/acf.py b/sktime/transformations/series/acf.py
index 52609eecd0f..f0cdda1dde9 100644
--- a/sktime/transformations/series/acf.py
+++ b/sktime/transformations/series/acf.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
 
-"""
-Auto-correlation transformations.
+"""Auto-correlation transformations.
 
 Module :mod:`sktime.transformations.series` implements auto-correlation
 transformers.
@@ -20,11 +20,10 @@
 
 
 class AutoCorrelationTransformer(_SeriesToSeriesTransformer):
-    """
-    Auto-correlation transformer.
+    """Auto-correlation transformer.
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.transformations.series.acf import PartialAutoCorrelationTransformer
     >>> from sklearn.preprocessing import MinMaxScaler
     >>> from sktime.datasets import load_airline
@@ -83,8 +82,7 @@ def transform(self, Z, X=None):
 
 
 class PartialAutoCorrelationTransformer(_SeriesToSeriesTransformer):
-    """
-    Partial auto-correlation transformer.
+    """Partial auto-correlation transformer.
 
     Parameters
     ----------
@@ -99,8 +97,8 @@ class PartialAutoCorrelationTransformer(_SeriesToSeriesTransformer):
         - ld or ldunbiased : Levinson-Durbin recursion with bias correction
         - ldb or ldbiased : Levinson-Durbin recursion without bias correction
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.transformations.series.acf import AutoCorrelationTransformer
     >>> from sklearn.preprocessing import MinMaxScaler
     >>> from sktime.datasets import load_airline
diff --git a/sktime/transformations/series/adapt.py b/sktime/transformations/series/adapt.py
index 04a29018f56..cba0ec1d0cb 100644
--- a/sktime/transformations/series/adapt.py
+++ b/sktime/transformations/series/adapt.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements adaptor for applying Scikit-learn-like transformers to time series."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["TabularToSeriesAdaptor"]
@@ -32,8 +33,7 @@ def _from_2d_numpy_to_series(x, index=None):
 
 
 class TabularToSeriesAdaptor(_SeriesToSeriesTransformer):
-    """Adaptor for scikit-learn-like tabular transformations to series
-    setting.
+    """Adapt scikit-learn-like tabular transformations to series setting.
 
     This is useful for applying scikit-learn transformations to series,
     but only works with transformations that do not require multiple
@@ -44,8 +44,8 @@ class TabularToSeriesAdaptor(_SeriesToSeriesTransformer):
     transformer : Estimator
         scikit-learn-like transformer to fit and apply to series
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.transformations.series.adapt import TabularToSeriesAdaptor
     >>> from sklearn.preprocessing import MinMaxScaler
     >>> from sktime.datasets import load_airline
@@ -82,6 +82,7 @@ def fit(self, Z, X=None):
 
     def transform(self, Z, X=None):
         """Transform data.
+
         Returns a transformed version of y.
 
         Parameters
diff --git a/sktime/transformations/series/boxcox.py b/sktime/transformations/series/boxcox.py
index 720f915e6fb..192bbeaa1cf 100644
--- a/sktime/transformations/series/boxcox.py
+++ b/sktime/transformations/series/boxcox.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
-"""copyright: sktime developers, BSD-3-Clause License (see LICENSE file)."""
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file).
+"""Implements Box-Cox and Log Transformations."""
 
 __author__ = ["Markus Löning"]
-__all__ = ["BoxCoxTransformer"]
+__all__ = ["BoxCoxTransformer", "LogTransformer"]
 
 import numpy as np
 import pandas as pd
@@ -26,8 +27,8 @@
 class BoxCoxTransformer(_SeriesToSeriesTransformer):
     """Box-Cox power transform.
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.transformations.series.boxcox import BoxCoxTransformer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
@@ -109,14 +110,53 @@ def inverse_transform(self, Z, X=None):
 
 
 class LogTransformer(_SeriesToSeriesTransformer):
+    """Log transformation.
+
+    Examples
+    --------
+    >>> from sktime.transformations.series.boxcox import LogTransformer
+    >>> from sktime.datasets import load_airline
+    >>> y = load_airline()
+    >>> transformer = LogTransformer()
+    >>> y_hat = transformer.fit_transform(y)
+    """
+
     _tags = {"transform-returns-same-time-index": True}
 
     def transform(self, Z, X=None):
+        """Transform data.
+
+        Parameters
+        ----------
+        Z : pd.Series
+            Series to transform.
+        X : pd.DataFrame, optional (default=None)
+            Exogenous data used in transformation.
+
+        Returns
+        -------
+        Zt : pd.Series
+            Transformed series.
+        """
         self.check_is_fitted()
         Z = check_series(Z)
         return np.log(Z)
 
     def inverse_transform(self, Z, X=None):
+        """Inverse transform data.
+
+        Parameters
+        ----------
+        Z : pd.Series
+            Series to transform.
+        X : pd.DataFrame, optional (default=None)
+            Exogenous data used in transformation.
+
+        Returns
+        -------
+        Zt : pd.Series
+            Transformed data - the inverse of the log transformation.
+        """
         self.check_is_fitted()
         Z = check_series(Z)
         return np.exp(Z)
@@ -181,7 +221,7 @@ def _all(x):
 
 
 def _guerrero(x, sp, bounds=None):
-    r"""Return lambda estimated by the Guerrero method [Guerrero].
+    """Estimate lambda using the Guerrero method as described in [1]_.
 
     Parameters
     ----------
@@ -201,8 +241,8 @@ def _guerrero(x, sp, bounds=None):
 
     References
     ----------
-    [Guerrero] V.M. Guerrero, "Time-series analysis supported by Power
-    Transformations ", Journal of Forecasting, vol. 12, pp. 37-48, 1993.
+    .. [1] V.M. Guerrero, "Time-series analysis supported by Power
+       Transformations ", Journal of Forecasting, vol. 12, pp. 37-48, 1993.
     """
     if sp is None or not is_int(sp) or sp < 2:
         raise ValueError(
diff --git a/sktime/transformations/series/compose.py b/sktime/transformations/series/compose.py
index 1140a5c3a1b..c0302c020cb 100644
--- a/sktime/transformations/series/compose.py
+++ b/sktime/transformations/series/compose.py
@@ -1,11 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
-
-"""Series-to-Series Transformers: OptionalPassthrough and Columnwisetransformer."""
-
-__author__ = ["Martin Walter", "Svea Meyer"]
-__all__ = ["OptionalPassthrough", "ColumnwiseTransformer"]
+"""Meta-transformers for building composite transformers."""
 
 import pandas as pd
 from sktime.transformations.base import _SeriesToSeriesTransformer
@@ -14,12 +10,14 @@
 from sklearn.base import clone
 from sklearn.utils.metaestimators import if_delegate_has_method
 
+__author__ = ["Martin Walter", "Svea Meyer"]
+__all__ = ["OptionalPassthrough", "ColumnwiseTransformer"]
+
 
 class OptionalPassthrough(_SeriesToSeriesTransformer):
-    """
-    Tune implicit hyperparameter.
+    """Wrap an existing transformer to tune whether to include it in a pipeline.
 
-    A transformer to tune the implicit hyperparameter whether or not to use a
+    Allows tuning the implicit hyperparameter whether or not to use a
     particular transformer inside a pipeline (e.g. TranformedTargetForecaster)
     or not. This is achived by having the additional hyperparameter
     "passthrough" which can be added to a grid then (see example).
@@ -32,8 +30,8 @@ class OptionalPassthrough(_SeriesToSeriesTransformer):
         This arg decides whether to apply the given transformer or to just
         passthrough the data (identity transformation)
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.datasets import load_airline
     >>> from sktime.forecasting.naive import NaiveForecaster
     >>> from sktime.transformations.series.compose import OptionalPassthrough
@@ -82,7 +80,7 @@ def __init__(self, transformer, passthrough=False):
         super(OptionalPassthrough, self).__init__()
 
     def fit(self, Z, X=None):
-        """Fit data.
+        """Fit the model.
 
         Parameters
         ----------
@@ -102,7 +100,7 @@ def fit(self, Z, X=None):
         return self
 
     def transform(self, Z, X=None):
-        """Transform data.
+        """Apply transformation.
 
         Parameters
         ----------
diff --git a/sktime/transformations/series/cos.py b/sktime/transformations/series/cos.py
index 8394253f4d3..e41b659a434 100644
--- a/sktime/transformations/series/cos.py
+++ b/sktime/transformations/series/cos.py
@@ -1,4 +1,8 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements cosine transformation."""
+
 import numpy as np
 
 from sktime.transformations.base import _SeriesToSeriesTransformer
@@ -9,9 +13,10 @@
 
 
 class CosineTransformer(_SeriesToSeriesTransformer):
-    """
-    Example
-    ----------
+    """Cosine transformation.
+
+    Examples
+    --------
     >>> from sktime.transformations.series.cos import CosineTransformer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
@@ -22,6 +27,20 @@ class CosineTransformer(_SeriesToSeriesTransformer):
     _tags = {"transform-returns-same-time-index": True, "fit-in-transform": True}
 
     def transform(self, Z, X=None):
+        """Transform data.
+
+        Parameters
+        ----------
+        Z : pd.Series
+            Series to transform.
+        X : pd.DataFrame, optional (default=None)
+            Exogenous data used in transformation.
+
+        Returns
+        -------
+        Zt : pd.Series
+            Transformed series.
+        """
         self.check_is_fitted()
         Z = check_series(Z)
         return np.cos(Z)
diff --git a/sktime/transformations/series/detrend/_deseasonalize.py b/sktime/transformations/series/detrend/_deseasonalize.py
index f21b41deeac..7f3b7ba46ca 100644
--- a/sktime/transformations/series/detrend/_deseasonalize.py
+++ b/sktime/transformations/series/detrend/_deseasonalize.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements transformations to deseasonalize a time series."""
 
 __author__ = ["Markus Löning"]
 __all__ = [
@@ -22,8 +23,7 @@
 
 
 class Deseasonalizer(_SeriesToSeriesTransformer):
-    """A transformer that removes seasonal components from time
-    series.
+    """A transformer that removes seasonal components from time series.
 
     Parameters
     ----------
@@ -32,8 +32,8 @@ class Deseasonalizer(_SeriesToSeriesTransformer):
     model : str {"additive", "multiplicative"}, optional (default="additive")
         Model to use for estimating seasonal component
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.transformations.series.detrend import Deseasonalizer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
@@ -59,7 +59,7 @@ def _set_y_index(self, y):
         self._y_index = y.index
 
     def _align_seasonal(self, y):
-        """Align seasonal components with y's time index"""
+        """Align seasonal components with y's time index."""
         shift = (
             -_get_duration(
                 y.index[0],
@@ -115,6 +115,7 @@ def _inverse_transform(self, y, seasonal):
 
     def transform(self, Z, X=None):
         """Transform data.
+
         Returns a transformed version of y.
 
         Parameters
@@ -134,6 +135,7 @@ def transform(self, Z, X=None):
 
     def inverse_transform(self, Z, X=None):
         """Inverse transform data.
+
         Returns a transformed version of y.
 
         Parameters
@@ -152,7 +154,7 @@ def inverse_transform(self, Z, X=None):
         return self._inverse_transform(z, seasonal)
 
     def update(self, Z, X=None, update_params=False):
-        """Update fitted parameters
+        """Update fitted parameters.
 
         Parameters
         ----------
@@ -171,8 +173,7 @@ def update(self, Z, X=None, update_params=False):
 
 
 class ConditionalDeseasonalizer(Deseasonalizer):
-    """A transformer that removes seasonal components from time
-    series, conditional on seasonality test.
+    """Remove seasonal components from time series, conditional on seasonality test.
 
     Parameters
     ----------
@@ -192,8 +193,7 @@ def __init__(self, seasonality_test=None, sp=1, model="additive"):
         super(ConditionalDeseasonalizer, self).__init__(sp=sp, model=model)
 
     def _check_condition(self, y):
-        """Check if y meets condition"""
-
+        """Check if y meets condition."""
         if not callable(self.seasonality_test_):
             raise ValueError(
                 f"`func` must be a function/callable, but found: "
@@ -219,7 +219,6 @@ def fit(self, Z, X=None):
         -------
         self : an instance of self
         """
-
         z = check_series(Z, enforce_univariate=True)
         self._set_y_index(z)
         sp = check_sp(self.sp)
diff --git a/sktime/transformations/series/detrend/_detrend.py b/sktime/transformations/series/detrend/_detrend.py
index 833812df5c4..cb6066f0ceb 100644
--- a/sktime/transformations/series/detrend/_detrend.py
+++ b/sktime/transformations/series/detrend/_detrend.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements transformations to detrend a time series."""
 
 __all__ = ["Detrender"]
 __author__ = ["Markus Löning", "Svea Meyer"]
@@ -15,8 +16,8 @@
 
 
 class Detrender(_SeriesToSeriesTransformer):
-    """
-    Remove a trend from a series.
+    """Remove a trend from a series.
+
     This transformer uses any forecaster and returns the in-sample residuals
     of the forecaster's predicted values.
 
@@ -49,8 +50,8 @@ class Detrender(_SeriesToSeriesTransformer):
     forecaster_ : estimator object
         Model that defines the trend in the series
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.transformations.series.detrend import Detrender
     >>> from sktime.forecasting.trend import PolynomialTrendForecaster
     >>> from sktime.datasets import load_airline
@@ -68,8 +69,7 @@ def __init__(self, forecaster=None):
         super(Detrender, self).__init__()
 
     def fit(self, Z, X=None):
-        """
-        Compute the trend in the series
+        """Compute the trend in the series.
 
         Parameters
         ----------
@@ -101,8 +101,7 @@ def fit(self, Z, X=None):
         return self
 
     def transform(self, Z, X=None):
-        """
-        Remove trend from the data.
+        """Remove trend from the data.
 
         Parameters
         ----------
@@ -142,8 +141,7 @@ def transform(self, Z, X=None):
             return z - z_pred
 
     def inverse_transform(self, Z, X=None):
-        """
-        Add trend back to a time series
+        """Add trend back to a time series.
 
         Parameters
         ----------
@@ -183,8 +181,7 @@ def inverse_transform(self, Z, X=None):
             return z + z_pred
 
     def update(self, Z, X=None, update_params=True):
-        """
-        Update the parameters of the detrending estimator with new data
+        """Update the parameters of the detrending estimator with new data.
 
         Parameters
         ----------
diff --git a/sktime/transformations/series/exponent.py b/sktime/transformations/series/exponent.py
index be3273bf9d7..e5fb0d8579f 100644
--- a/sktime/transformations/series/exponent.py
+++ b/sktime/transformations/series/exponent.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
-"""Classes to raise timeseries to a user provied exponent."""
+"""Implements transformers to raise time series to a user-provided exponent."""
 
 __author__ = ["Ryan Kuhns"]
 __all__ = ["ExponentTransformer", "SqrtTransformer"]
@@ -42,8 +42,8 @@ class ExponentTransformer(_SeriesToSeriesTransformer):
     offset : int or float
         User supplied offset value.
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.transformations.series.exponent import ExponentTransformer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
@@ -76,7 +76,6 @@ def _fit(self, Z, X=None):
         -------
         self
         """
-
         if not isinstance(self.power, (int, float)):
             raise ValueError(
                 f"Expected `power` to be int or float, but found {type(self.power)}."
@@ -216,8 +215,8 @@ class SqrtTransformer(ExponentTransformer):
     offset : int or float
         User supplied offset value.
 
-    Example
-    -------
+    Examples
+    --------
     >>> from sktime.transformations.series.exponent import SqrtTransformer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
diff --git a/sktime/transformations/series/impute.py b/sktime/transformations/series/impute.py
index a7306bd8ecf..120d9007f71 100644
--- a/sktime/transformations/series/impute.py
+++ b/sktime/transformations/series/impute.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
 """Utilities to impute series with missing values."""
+
 __author__ = ["Martin Walter"]
 __all__ = ["Imputer"]
 
@@ -44,8 +46,8 @@ class Imputer(_SeriesToSeriesTransformer):
     random_state : int/float/str, optional
         Value to set random.seed() if method="random", default None
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.transformations.series.impute import Imputer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
diff --git a/sktime/transformations/series/matrix_profile.py b/sktime/transformations/series/matrix_profile.py
index 2186595e80d..a0d36843d2c 100644
--- a/sktime/transformations/series/matrix_profile.py
+++ b/sktime/transformations/series/matrix_profile.py
@@ -1,4 +1,7 @@
+#!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements matrix profile transformation."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["MatrixProfileTransformer"]
@@ -16,7 +19,8 @@
 
 
 class MatrixProfileTransformer(_SeriesToSeriesTransformer):
-    """
+    """Calculate the matrix profile of a time series.
+
     Takes as input a single time series dataset and returns the matrix profile
     for that time series dataset.
 
@@ -24,10 +28,10 @@ class MatrixProfileTransformer(_SeriesToSeriesTransformer):
     ----------
     window_length : int
 
-    Example
-    ----------
-    # noqa:
-    >>> from sktime.transformations.series.matrix_profile import MatrixProfileTransformer
+    Examples
+    --------
+    >>> from sktime.transformations.series.matrix_profile import \
+    MatrixProfileTransformer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
     >>> transformer = MatrixProfileTransformer()
@@ -41,14 +45,15 @@ def __init__(self, window_length=3):
         super(MatrixProfileTransformer, self).__init__()
 
     def transform(self, Z, X=None):
-        """
+        """Transform data.
+
         Parameters
         ----------
         Z: pandas.Series
             Time series dataset(lets say of length=n)
 
         Returns
-        ----------
+        -------
         Z: pandas.Series
             Matrix Profile of time series as output with length as (n-window_length+1)
         """
diff --git a/sktime/transformations/series/outlier_detection.py b/sktime/transformations/series/outlier_detection.py
index 6d91aa05234..133e19d93cc 100644
--- a/sktime/transformations/series/outlier_detection.py
+++ b/sktime/transformations/series/outlier_detection.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements transformers for detecting outliers in a time series."""
 
 __author__ = ["Martin Walter"]
 __all__ = ["HampelFilter"]
@@ -14,8 +16,9 @@
 
 
 class HampelFilter(_SeriesToSeriesTransformer):
-    """HampelFilter to detect outliers based on a sliding window. Correction
-    of outliers is recommended by means of the sktime.Imputer,
+    """Use HampelFilter to detect outliers based on a sliding window.
+
+    Correction of outliers is recommended by means of the sktime.Imputer,
     so both can be tuned separately.
 
     Parameters
@@ -31,13 +34,17 @@ class HampelFilter(_SeriesToSeriesTransformer):
         If True, outliers are filled with True and non-outliers with False.
         Else, outliers are filled with np.nan.
 
+    Notes
+    -----
+    Implementation is based on [1]_.
+
     References
     ----------
-    Hampel F. R., "The influence curve and its role in robust estimation",
-    Journal of the American Statistical Association, 69, 382–393, 1974
+    .. [1] Hampel F. R., "The influence curve and its role in robust estimation",
+       Journal of the American Statistical Association, 69, 382–393, 1974
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.transformations.series.outlier_detection import HampelFilter
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
@@ -61,6 +68,7 @@ def __init__(self, window_length=10, n_sigma=3, k=1.4826, return_bool=False):
 
     def transform(self, Z, X=None):
         """Transform data.
+
         Returns a transformed version of Z.
 
         Parameters
@@ -86,7 +94,8 @@ def transform(self, Z, X=None):
         return Z
 
     def _transform_series(self, Z):
-        """
+        """Logic internal to the algorithm for transforming the input series.
+
         Parameters
         ----------
         Z : pd.Series
@@ -161,7 +170,7 @@ def _hampel_filter(Z, cv, n_sigma, half_window_length, k):
 
 
 def _compare(value, cv_median, cv_sigma, n_sigma):
-    """Function to identify an outlier
+    """Identify an outlier.
 
     Parameters
     ----------
diff --git a/sktime/transformations/series/summarize.py b/sktime/transformations/series/summarize.py
index ce16c4a11f2..9cd812778b7 100644
--- a/sktime/transformations/series/summarize.py
+++ b/sktime/transformations/series/summarize.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements transformers for summarizing a time series."""
 
 __author__ = ["Markus Löning"]
 __all__ = ["MeanTransformer"]
@@ -11,10 +13,10 @@
 
 
 class MeanTransformer(_SeriesToPrimitivesTransformer):
-    """Get mean value of time series
+    """Get mean value of time series.
 
-    Example
-    ----------
+    Examples
+    --------
     >>> from sktime.transformations.series.summarize import MeanTransformer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
@@ -23,7 +25,8 @@ class MeanTransformer(_SeriesToPrimitivesTransformer):
     """
 
     def transform(self, Z, X=None):
-        """
+        """Transform series.
+
         Parameters
         ----------
         Z : pd.Series
diff --git a/sktime/transformations/series/theta.py b/sktime/transformations/series/theta.py
index 27257df242e..cb2d31fab18 100644
--- a/sktime/transformations/series/theta.py
+++ b/sktime/transformations/series/theta.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3 -u
 # -*- coding: utf-8 -*-
-"""copyright: sktime developers, BSD-3-Clause License (see LICENSE file)."""
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Implements Theta-lines transformation for use with automatic theta forecasting."""
 
 __author__ = ["Guzal Bulatova", "Markus Löning"]
 __all__ = ["ThetaLinesTransformer"]
@@ -17,19 +18,23 @@
 class ThetaLinesTransformer(_SeriesToSeriesTransformer):
     """Decompose the original data into two or more Theta-lines.
 
-    Example
-    -------
+    Notes
+    -----
+    Implements decomposition as described in [1]_.
+
+    References
+    ----------
+    .. [1] E. Spiliotis et al., "Generalizing the Theta method for
+       automatic forecasting", European Journal of Operational
+       Research, vol. 284, pp. 550-558, 2020.
+
+    Examples
+    --------
     >>> from sktime.transformations.series.theta import ThetaLinesTransformer
     >>> from sktime.datasets import load_airline
     >>> y = load_airline()
     >>> transformer = ThetaLinesTransformer([0, 0.25, 0.5, 0.75])
     >>> y_thetas = transformer.fit_transform(y)
-
-    References
-    ----------
-    [1] E.Spiliotis et al., "Generalizing the Theta method for
-    automatic forecasting ", European Journal of Operational
-    Research, vol. 284, pp. 550-558, 2020.
     """
 
     _tags = {