docs:Additional examples and explanations of using Quantile regression pipelines.

antononcube · antononcube · commit 1ba34b004f9f · 2024-08-30T09:46:21.000-04:00
diff --git a/docs/Anomalies-detection-via-Quantile-regression.ipynb b/docs/Anomalies-detection-via-Quantile-regression.ipynb
@@ -0,0 +1,180 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Anomalies detection via Quantile regression\n",
+    "\n",
+    "Anton Antonov    \n",
+    "[PythonForPrediction at WordPress](https://pythonforprediction.wordpress.com)   \n",
+    "August 2024"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Introduction"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "------\n",
+    "\n",
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the \"Regressionizer\" and other \"standard\" packages:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from Regressionizer import *\n",
+    "from OutlierIdentifiers import *\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import plotly.express as px\n",
+    "import plotly.graph_objects as go"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template='plotly_dark'\n",
+    "data_color='darkgray'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "-----\n",
+    "\n",
+    "## Get data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://raw.githubusercontent.com/antononcube/SimplifiedMachineLearningWorkflows-book/master/R/ChampaignUrbanaDataScienceUserGroup-Meetup-February-2021/data/dfAppleMobilityLongForm.csv\"\n",
+    "dfMobilityData = pd.read_csv(url)\n",
+    "dfMobilityData['DateObject'] = pd.to_datetime(dfMobilityData['Date'], format='%Y-%m-%d')\n",
+    "dfMobilityData = dfMobilityData.sort_values(by=\"Date\")\n",
+    "dfMobilityData"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert to \"numpy\" array: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "usage_data = dfMobilityData[['Date', 'Value']].to_numpy()\n",
+    "usage_data[:,0] = dates_to_seconds(usage_data[:,0], epoch_start=\"1900-01-01\")\n",
+    "#usage_data = usage_data[usage_data[:, 0].argsort()]\n",
+    "usage_data.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here is pipeline for Quantile Regression computation and making of a corresponding plot:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj = (\n",
+    "    Regressionizer(usage_data)\n",
+    "    .echo_data_summary()\n",
+    "    .quantile_regression(knots=50, probs=[0.2])\n",
+    "    .date_list_plot(title=\"Apple mobility data\", template=template, data_color=data_color, width = 1200)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Show the obtained plot:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = obj.take_value()\n",
+    "#fig.add_trace(go.Scatter(x=to_datetime_index(usage_data[:,0]), y=usage_data[:,1], mode='lines', name='Data time series'))\n",
+    "fig.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "outliers = (obj\n",
+    ".find_anomalies_by_residuals(\n",
+    "    relative_errors=True,\n",
+    "    threshold=None, \n",
+    "    outlier_identifier=quartile_identifier_parameters)\n",
+    ".take_value());\n",
+    "\n",
+    "fig.add_trace(go.Scatter(x=to_datetime_index(outliers[:,0]), y=outliers[:,1], mode='markers', name='Outliers', marker_color = \"orange\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "SciPyCentric",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/Rapid-specification-of-regression-workflows.ipynb b/docs/Rapid-specification-of-regression-workflows.ipynb
@@ -428,8 +428,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "\n",
     "url = \"https://raw.githubusercontent.com/antononcube/MathematicaVsR/master/Data/MathematicaVsR-Data-Atlanta-GA-USA-Temperature.csv\"\n",
     "dfTemperature = pd.read_csv(url)\n",
     "dfTemperature['DateObject'] = pd.to_datetime(dfTemperature['Date'], format='%Y-%m-%d')\n",
@@ -497,6 +495,32 @@
     "obj.take_value().show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "840d366e",
+   "metadata": {},
+   "source": [
+    "Here we show the fractions of the number of points under each regression quantile:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "121ef46c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj.separate(cumulative=True, fractions=True).take_value()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ec67780b",
+   "metadata": {},
+   "source": [
+    "**Remark:** If the quantile regression algorithms work correctly then the cumulation separation fractions correspond -- i.e. are nearly equal - to the probabilities of the regression quantiles."
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "faa93dbb",
@@ -616,7 +640,7 @@
     "        date_plot=True, \n",
     "        template=template,\n",
     "        data_color=data_color,\n",
-    "        width = 1600, height = 400)\n",
+    "        width = 1200, height = 400)\n",
     ")\n",
     "\n",
     "obj.take_value().show()"