Skip to content

Commit 1ba34b0

Browse files
committed
docs:Additional examples and explanations of using Quantile regression pipelines.
1 parent b12c8f4 commit 1ba34b0

2 files changed

+207
-3
lines changed
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Anomaly detection via Quantile regression\n",
8+
"\n",
9+
"Anton Antonov \n",
10+
"[PythonForPrediction at WordPress](https://pythonforprediction.wordpress.com) \n",
11+
"August 2024"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"## Introduction"
19+
]
20+
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"------\n",
26+
"\n",
27+
"## Setup"
28+
]
29+
},
30+
{
31+
"cell_type": "markdown",
32+
"metadata": {},
33+
"source": [
34+
"Load the \"Regressionizer\" and other \"standard\" packages:"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"metadata": {},
41+
"outputs": [],
42+
"source": [
43+
"from Regressionizer import *\n",
44+
"from OutlierIdentifiers import *\n",
45+
"\n",
46+
"import numpy as np\n",
47+
"import pandas as pd\n",
48+
"import plotly.express as px\n",
49+
"import plotly.graph_objects as go"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": null,
55+
"metadata": {},
56+
"outputs": [],
57+
"source": [
58+
"template='plotly_dark'\n",
59+
"data_color='darkgray'"
60+
]
61+
},
62+
{
63+
"cell_type": "markdown",
64+
"metadata": {},
65+
"source": [
66+
"-----\n",
67+
"\n",
68+
"## Get data"
69+
]
70+
},
71+
{
72+
"cell_type": "code",
73+
"execution_count": null,
74+
"metadata": {},
75+
"outputs": [],
76+
"source": [
77+
"url = \"https://raw.githubusercontent.com/antononcube/SimplifiedMachineLearningWorkflows-book/master/R/ChampaignUrbanaDataScienceUserGroup-Meetup-February-2021/data/dfAppleMobilityLongForm.csv\"\n",
78+
"dfMobilityData = pd.read_csv(url)\n",
79+
"dfMobilityData['DateObject'] = pd.to_datetime(dfMobilityData['Date'], format='%Y-%m-%d')\n",
80+
"dfMobilityData = dfMobilityData.sort_values(by=\"Date\")\n",
81+
"dfMobilityData"
82+
]
83+
},
84+
{
85+
"cell_type": "markdown",
86+
"metadata": {},
87+
"source": [
88+
"Convert to a \"numpy\" array:"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": null,
94+
"metadata": {},
95+
"outputs": [],
96+
"source": [
97+
"usage_data = dfMobilityData[['Date', 'Value']].to_numpy()\n",
98+
"usage_data[:,0] = dates_to_seconds(usage_data[:,0], epoch_start=\"1900-01-01\")\n",
99+
"#usage_data = usage_data[usage_data[:, 0].argsort()]\n",
100+
"usage_data.shape"
101+
]
102+
},
103+
{
104+
"cell_type": "markdown",
105+
"metadata": {},
106+
"source": [
107+
"Here is a pipeline for Quantile Regression computation and the making of a corresponding plot:"
108+
]
109+
},
110+
{
111+
"cell_type": "code",
112+
"execution_count": null,
113+
"metadata": {},
114+
"outputs": [],
115+
"source": [
116+
"obj = (\n",
117+
" Regressionizer(usage_data)\n",
118+
" .echo_data_summary()\n",
119+
" .quantile_regression(knots=50, probs=[0.2])\n",
120+
" .date_list_plot(title=\"Apple mobility data\", template=template, data_color=data_color, width = 1200)\n",
121+
")"
122+
]
123+
},
124+
{
125+
"cell_type": "markdown",
126+
"metadata": {},
127+
"source": [
128+
"Show the obtained plot:"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": null,
134+
"metadata": {},
135+
"outputs": [],
136+
"source": [
137+
"fig = obj.take_value()\n",
138+
"#fig.add_trace(go.Scatter(x=to_datetime_index(usage_data[:,0]), y=usage_data[:,1], mode='lines', name='Data time series'))\n",
139+
"fig.show()\n"
140+
]
141+
},
142+
{
143+
"cell_type": "code",
144+
"execution_count": null,
145+
"metadata": {},
146+
"outputs": [],
147+
"source": [
148+
"outliers = (obj\n",
149+
".find_anomalies_by_residuals(\n",
150+
" relative_errors=True,\n",
151+
" threshold=None, \n",
152+
" outlier_identifier=quartile_identifier_parameters)\n",
153+
".take_value());\n",
154+
"\n",
155+
"fig.add_trace(go.Scatter(x=to_datetime_index(outliers[:,0]), y=outliers[:,1], mode='markers', name='Outliers', marker_color = \"orange\"))"
156+
]
157+
}
158+
],
159+
"metadata": {
160+
"kernelspec": {
161+
"display_name": "SciPyCentric",
162+
"language": "python",
163+
"name": "python3"
164+
},
165+
"language_info": {
166+
"codemirror_mode": {
167+
"name": "ipython",
168+
"version": 3
169+
},
170+
"file_extension": ".py",
171+
"mimetype": "text/x-python",
172+
"name": "python",
173+
"nbconvert_exporter": "python",
174+
"pygments_lexer": "ipython3",
175+
"version": "3.12.2"
176+
}
177+
},
178+
"nbformat": 4,
179+
"nbformat_minor": 2
180+
}

docs/Rapid-specification-of-regression-workflows.ipynb

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -428,8 +428,6 @@
428428
"metadata": {},
429429
"outputs": [],
430430
"source": [
431-
"import pandas as pd\n",
432-
"\n",
433431
"url = \"https://raw.githubusercontent.com/antononcube/MathematicaVsR/master/Data/MathematicaVsR-Data-Atlanta-GA-USA-Temperature.csv\"\n",
434432
"dfTemperature = pd.read_csv(url)\n",
435433
"dfTemperature['DateObject'] = pd.to_datetime(dfTemperature['Date'], format='%Y-%m-%d')\n",
@@ -497,6 +495,32 @@
497495
"obj.take_value().show()"
498496
]
499497
},
498+
{
499+
"cell_type": "markdown",
500+
"id": "840d366e",
501+
"metadata": {},
502+
"source": [
503+
"Here we show the fractions of the number of points under each regression quantile:"
504+
]
505+
},
506+
{
507+
"cell_type": "code",
508+
"execution_count": null,
509+
"id": "121ef46c",
510+
"metadata": {},
511+
"outputs": [],
512+
"source": [
513+
"obj.separate(cumulative=True, fractions=True).take_value()"
514+
]
515+
},
516+
{
517+
"cell_type": "markdown",
518+
"id": "ec67780b",
519+
"metadata": {},
520+
"source": [
521+
"**Remark:** If the quantile regression algorithms work correctly, then the cumulative separation fractions correspond -- i.e. are nearly equal -- to the probabilities of the regression quantiles."
522+
]
523+
},
500524
{
501525
"cell_type": "markdown",
502526
"id": "faa93dbb",
@@ -616,7 +640,7 @@
616640
" date_plot=True, \n",
617641
" template=template,\n",
618642
" data_color=data_color,\n",
619-
" width = 1600, height = 400)\n",
643+
" width = 1200, height = 400)\n",
620644
")\n",
621645
"\n",
622646
"obj.take_value().show()"

0 commit comments

Comments
 (0)