Skip to content

Commit 6f0b47d

Browse files
authored
supporting content for ingesting-data-with-big-query (#367)
1 parent f598877 commit 6f0b47d

File tree

1 file changed

+391
-0
lines changed

1 file changed

+391
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,391 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {
6+
"id": "LlrEjmtJNpuX"
7+
},
8+
"source": [
9+
"# Ingesting data with BigQuery\n",
10+
"\n",
11+
"This notebook demonstrates how to read data stored in BigQuery and index it into Elasticsearch. It is based on the article [Ingesting data with BigQuery](https://www.elastic.co/search-labs/blog/ingesting-data-with-big-query)."
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {
17+
"id": "GNaAN-GNO5qp"
18+
},
19+
"source": [
20+
"## Installing dependencies and importing packages"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": null,
26+
"metadata": {
27+
"colab": {
28+
"base_uri": "https://localhost:8080/"
29+
},
30+
"id": "qgclZayCk1Ct",
31+
"outputId": "7da1e962-ead6-4016-b2e5-12a019885d86"
32+
},
33+
"outputs": [],
34+
"source": [
35+
"!pip install google-cloud-bigquery elasticsearch==8.16 google-auth"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": null,
41+
"metadata": {
42+
"id": "rAesontNXpLu"
43+
},
44+
"outputs": [],
45+
"source": [
46+
"from elasticsearch import Elasticsearch, exceptions\n",
47+
"from google.cloud import bigquery\n",
48+
"from google.colab import auth\n",
49+
"from getpass import getpass\n",
50+
"\n",
51+
"import json"
52+
]
53+
},
54+
{
55+
"cell_type": "markdown",
56+
"metadata": {
57+
"id": "NwOmnk99Pfh3"
58+
},
59+
"source": [
60+
"## Declaring variables"
61+
]
62+
},
63+
{
64+
"cell_type": "markdown",
65+
"metadata": {
66+
"id": "-4sV9fiXdBwj"
67+
},
68+
"source": [
69+
"The next cell prompts you for your Elasticsearch endpoint and API key, and sets the Google Cloud project and BigQuery dataset names (replace them with your own).\n",
70+
"To learn how to retrieve your Elasticsearch credentials, see [Finding Your Cloud ID](https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id)."
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": null,
76+
"metadata": {
77+
"colab": {
78+
"base_uri": "https://localhost:8080/"
79+
},
80+
"id": "GVKJKfFpPWuj",
81+
"outputId": "21c6f4e3-8cb5-4a2c-8efe-0e45c3c6b1c4"
82+
},
83+
"outputs": [],
84+
"source": [
85+
"ELASTICSEARCH_ENDPOINT = getpass(\"Elasticsearch endpoint: \")\n",
86+
"ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n",
87+
"\n",
88+
"\n",
89+
"# Google Cloud project name and BigQuery dataset name\n",
90+
"PROJECT_ID = \"elasticsearch-bigquery\"\n",
91+
"# dataset_id in format <your-project-name>.<your-dataset-name>\n",
92+
"DATASET_ID = f\"{PROJECT_ID}.server_logs\""
93+
]
94+
},
95+
{
96+
"cell_type": "markdown",
97+
"metadata": {
98+
"id": "3O2HclcYHEsS"
99+
},
100+
"source": [
101+
"## Instantiate an Elasticsearch client"
102+
]
103+
},
104+
{
105+
"cell_type": "code",
106+
"execution_count": null,
107+
"metadata": {
108+
"id": "1LWiop8NYiQF"
109+
},
110+
"outputs": [],
111+
"source": [
112+
"auth.authenticate_user()\n",
113+
"\n",
114+
"# Elasticsearch client\n",
115+
"es_client = Elasticsearch(\n",
116+
" ELASTICSEARCH_ENDPOINT,\n",
117+
" api_key=ELASTIC_API_KEY,\n",
118+
")"
119+
]
120+
},
121+
{
122+
"cell_type": "markdown",
123+
"metadata": {
124+
"id": "9lvPHaXjPlfu"
125+
},
126+
"source": [
127+
"## Creating mappings"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": null,
133+
"metadata": {
134+
"id": "tc88YzAYw31e"
135+
},
136+
"outputs": [],
137+
"source": [
138+
"try:\n",
139+
" es_client.indices.create(\n",
140+
" index=\"bigquery-logs\",\n",
141+
" body={\n",
142+
" \"mappings\": {\n",
143+
" \"properties\": {\n",
144+
" \"status_code_description\": {\"type\": \"match_only_text\"},\n",
145+
" \"status_code\": {\"type\": \"keyword\"},\n",
146+
" \"@timestamp\": {\"type\": \"date\"},\n",
147+
" \"ip_address\": {\"type\": \"ip\"},\n",
148+
" \"http_method\": {\"type\": \"keyword\"},\n",
149+
" \"endpoint\": {\"type\": \"keyword\"},\n",
150+
" \"response_time\": {\"type\": \"integer\"},\n",
151+
" }\n",
152+
" }\n",
153+
" },\n",
154+
" )\n",
155+
"except exceptions.RequestError as e:\n",
156+
" if e.error == \"resource_already_exists_exception\":\n",
157+
" print(\"Index already exists.\")\n",
158+
" else:\n",
159+
" raise e"
160+
]
161+
},
162+
{
163+
"cell_type": "markdown",
164+
"metadata": {
165+
"id": "AOtwAfPXP38Z"
166+
},
167+
"source": [
168+
"## Getting data from BigQuery"
169+
]
170+
},
171+
{
172+
"cell_type": "code",
173+
"execution_count": null,
174+
"metadata": {
175+
"colab": {
176+
"base_uri": "https://localhost:8080/"
177+
},
178+
"id": "DyKtsjXRXB2S",
179+
"outputId": "7734ad7e-cb11-42cc-b97d-e9241cab6b17"
180+
},
181+
"outputs": [],
182+
"source": [
183+
"client = bigquery.Client(project=PROJECT_ID)\n",
184+
"# Getting tables from dataset\n",
185+
"tables = client.list_tables(DATASET_ID)\n",
186+
"\n",
187+
"data = {}\n",
188+
"\n",
189+
"for table in tables:\n",
190+
"    # Fully qualified table id in format <project_id>.<dataset_name>.<table_name>\n",
191+
" table_id = f\"{DATASET_ID}.{table.table_id}\"\n",
192+
"\n",
193+
" print(f\"Processing table: {table.table_id}\")\n",
194+
"\n",
195+
" # Query to retrieve BigQuery tables data\n",
196+
" query = f\"\"\"\n",
197+
" SELECT *\n",
198+
" FROM `{table_id}`\n",
199+
" \"\"\"\n",
200+
"\n",
201+
" query_job = client.query(query)\n",
202+
"\n",
203+
" results = query_job.result()\n",
204+
"\n",
205+
" print(f\"Results for table: {table.table_id}:\")\n",
206+
"\n",
207+
" data[table.table_id] = []\n",
208+
"\n",
209+
" for row in results:\n",
210+
" # Saving data with key=table_id\n",
211+
" data[table.table_id].append(dict(row))\n",
212+
" print(row)"
213+
]
214+
},
215+
{
216+
"cell_type": "code",
217+
"execution_count": null,
218+
"metadata": {
219+
"colab": {
220+
"base_uri": "https://localhost:8080/"
221+
},
222+
"id": "UAznwqXStJ39",
223+
"outputId": "508a3255-43f7-4828-87d5-997cd04ca427"
224+
},
225+
"outputs": [],
226+
"source": [
227+
"# variable with data\n",
228+
"logs_data = data[\"logs\"]\n",
229+
"\n",
230+
"\n",
231+
"print(logs_data)"
232+
]
233+
},
234+
{
235+
"cell_type": "markdown",
236+
"metadata": {
237+
"id": "2s4Tr6wBP773"
238+
},
239+
"source": [
240+
"## Indexing to Elasticsearch"
241+
]
242+
},
243+
{
244+
"cell_type": "code",
245+
"execution_count": null,
246+
"metadata": {
247+
"colab": {
248+
"base_uri": "https://localhost:8080/"
249+
},
250+
"id": "C4goyH6ZbDJK",
251+
"outputId": "cfb4af41-1ef1-40da-a2da-6c0aa83633d3"
252+
},
253+
"outputs": [],
254+
"source": [
255+
"bulk_data = []\n",
256+
"\n",
257+
"for log_entry in logs_data:\n",
258+
" # Convert timestamp to ISO 8601 string\n",
259+
" timestamp_iso8601 = log_entry[\"_timestamp\"].isoformat()\n",
260+
"\n",
261+
" # Prepare action metadata\n",
262+
" action_metadata = {\n",
263+
" \"index\": {\n",
264+
" \"_index\": \"bigquery-logs\",\n",
265+
" \"_id\": f\"{log_entry['ip_address']}-{timestamp_iso8601}\",\n",
266+
" }\n",
267+
" }\n",
268+
"\n",
269+
" # Prepare document\n",
270+
" document = {\n",
271+
" \"ip_address\": log_entry[\"ip_address\"],\n",
272+
" \"status_code\": log_entry[\"status_code\"],\n",
273+
" \"@timestamp\": timestamp_iso8601,\n",
274+
" \"http_method\": log_entry[\"http_method\"],\n",
275+
" \"endpoint\": log_entry[\"endpoint\"],\n",
276+
" \"response_time\": log_entry[\"response_time\"],\n",
277+
" \"status_code_description\": log_entry[\"status_code_description\"],\n",
278+
" }\n",
279+
"\n",
280+
" # Append to bulk data\n",
281+
" bulk_data.append(action_metadata)\n",
282+
" bulk_data.append(document)\n",
283+
"\n",
284+
"print(bulk_data)"
285+
]
286+
},
287+
{
288+
"cell_type": "code",
289+
"execution_count": null,
290+
"metadata": {
291+
"colab": {
292+
"base_uri": "https://localhost:8080/"
293+
},
294+
"id": "WPEwsJrFbDHQ",
295+
"outputId": "ab5904f7-21c1-4596-fb06-55569cd9eb17"
296+
},
297+
"outputs": [],
298+
"source": [
299+
"try:\n",
300+
" # Indexing data\n",
301+
" response = es_client.bulk(body=bulk_data)\n",
302+
"\n",
303+
" if response[\"errors\"]:\n",
304+
" print(\"Errors while indexing:\")\n",
305+
" for item in response[\"items\"]:\n",
306+
" if \"error\" in item[\"index\"]:\n",
307+
" print(item[\"index\"][\"error\"])\n",
308+
" else:\n",
309+
" print(\"Documents indexed successfully.\")\n",
310+
"except Exception as e:\n",
311+
" print(f\"Bulk indexing failed: {e}\")"
312+
]
313+
},
314+
{
315+
"cell_type": "markdown",
316+
"metadata": {
317+
"id": "uxix_o8LQDup"
318+
},
319+
"source": [
320+
"## Searching data"
321+
]
322+
},
323+
{
324+
"cell_type": "code",
325+
"execution_count": null,
326+
"metadata": {
327+
"colab": {
328+
"base_uri": "https://localhost:8080/"
329+
},
330+
"id": "285_MwI8yknk",
331+
"outputId": "67d43aca-b05d-43f7-c730-1fa3bc0c4662"
332+
},
333+
"outputs": [],
334+
"source": [
335+
"response = es_client.search(\n",
336+
" index=\"bigquery-logs\",\n",
337+
" body={\n",
338+
" \"query\": {\"match\": {\"status_code_description\": \"error\"}},\n",
339+
" \"sort\": [{\"@timestamp\": {\"order\": \"desc\"}}],\n",
340+
" \"aggs\": {\"by_ip\": {\"terms\": {\"field\": \"ip_address\", \"size\": 10}}},\n",
341+
" },\n",
342+
")\n",
343+
"\n",
344+
"# Print results\n",
345+
"formatted_json = json.dumps(response.body, indent=4)\n",
346+
"print(formatted_json)"
347+
]
348+
},
349+
{
350+
"cell_type": "markdown",
351+
"metadata": {
352+
"id": "S6WZMJayyzxh"
353+
},
354+
"source": [
355+
"## Deleting\n",
356+
"\n",
357+
"Finally, delete the index created in this notebook so that it no longer consumes cluster resources."
358+
]
359+
},
360+
{
361+
"cell_type": "code",
362+
"execution_count": null,
363+
"metadata": {
364+
"colab": {
365+
"base_uri": "https://localhost:8080/"
366+
},
367+
"id": "9UcQwa41yy_x",
368+
"outputId": "02a33d89-b57c-4273-ba93-5b6441a4f91e"
369+
},
370+
"outputs": [],
371+
"source": [
372+
"# Cleanup - Delete Index\n",
373+
"es_client.indices.delete(index=\"bigquery-logs\", ignore=[400, 404])"
374+
]
375+
}
376+
],
377+
"metadata": {
378+
"colab": {
379+
"provenance": []
380+
},
381+
"kernelspec": {
382+
"display_name": "Python 3",
383+
"name": "python3"
384+
},
385+
"language_info": {
386+
"name": "python"
387+
}
388+
},
389+
"nbformat": 4,
390+
"nbformat_minor": 0
391+
}

0 commit comments

Comments
 (0)