Skip to content

Commit b8024f1

Browse files
committed
WIP
1 parent 8822792 commit b8024f1

9 files changed

+36720
-7
lines changed

Python/load_json_staging.ipynb

+35,760
Large diffs are not rendered by default.

README.md

+24-7
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
1-
# sql_example
1+
# SQL and Python for Loading and Querying Cricket Data
22

3-
This repository includes Python and SQL code that demonstrates competency in loading data to a star schema in SQL and performing queries on that data.
3+
This repository includes Python and SQL code that demonstrates competency in loading data to a star schema in SQL and performing queries on that data.
44

5-
The data loaded is cricket match data stored in JSON.
6-
- Python is utilised to load the JSON to a staging database in SQL server.
7-
- SQL is utilised to create staging and star schema databases, load staging data to the star schema and several information extraction queries (e.g. all cricket matches play in a specific location).
5+
JSON data on cricket matches spanning the past 20 years was obtained from https://cricsheet.org/ (approximately 17,700 JSON files, each file representing a single match). Python was used to read, process and load those JSON files to SQL Server; SQL was used to create the databases and to load, transform and query the cricket match data.
86

9-
The files below are included:
10-
- TBC
7+
Collectively, this demonstrates an end-to-end load, transformation and use of data. The specific competencies displayed include:
8+
- Utilising Python to load and flatten JSON files and load them to a SQL database
9+
- Data modelling a star schema database to support querying information
10+
- SQL for creating databases
11+
- SQL for data extraction, transforming and loading to databases
12+
- SQL for information querying utilising various techniques (e.g. JOINS, FILTERS, GROUPING, RANKING)
13+
14+
15+
16+
The table below shows the files included in this repository.
17+
18+
| File name | Type | Purpose |
19+
|-----------------------------------|--------|----------------------------------------------------------------------------------------------|
20+
| 1. create_cricket_db_staging | SQL | Create staging database - JSON is loaded here with Python script |
21+
| 2. create_cricket_db | SQL | Create star schema database - populated from staging database |
22+
| 3. tidy_staging | SQL | Performs some basic data cleansing of data in staging |
23+
| 4. populate_dim_date | SQL | Populates the date dimension in the star schema |
24+
| 4. populate_star_schema | SQL | Populates all Fact, Bridge and Dimension tables in the star schema from staging data |
25+
| 5. check_star_schema_load | SQL | Performs several data quality checks of staging to identify any potential errors |
26+
| 6. information_extraction_queries | SQL | Several queries to extract information on cricket matches from the newly created star schema |
27+
| load_json_staging | Python | Loads JSON source files to the staging database |

SQL/1. create_cricket_db_staging.sql

+86
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
-- Staging database bootstrap.
-- Creates CricketDBStaging only when it is not already present, so the
-- script can be re-run without failing on an existing database.
IF DB_ID('CricketDBStaging') IS NULL
BEGIN
    CREATE DATABASE CricketDBStaging;
END;
GO

-- Make the staging database the context for the rest of the script.
USE CricketDBStaging;
GO
12+
13+
-- Fresh start: remove any staging tables left over from a previous run.
-- Tables are dropped child-first (wickets -> deliveries -> overs -> innings
-- -> matches) so no foreign key still references a table being dropped.
IF OBJECT_ID('stg_wickets', 'U') IS NOT NULL
    DROP TABLE stg_wickets;

IF OBJECT_ID('stg_deliveries', 'U') IS NOT NULL
    DROP TABLE stg_deliveries;

IF OBJECT_ID('stg_overs', 'U') IS NOT NULL
    DROP TABLE stg_overs;

IF OBJECT_ID('stg_innings', 'U') IS NOT NULL
    DROP TABLE stg_innings;

IF OBJECT_ID('stg_matches', 'U') IS NOT NULL
    DROP TABLE stg_matches;
GO
20+
21+
22+
-- Create table for matches: one row per match, keyed by a generated match_id.
-- All descriptive attributes are held as text at this stage (match_date and
-- win_margin included); no typing is enforced here.
CREATE TABLE stg_matches (
    match_id        INT IDENTITY(1,1) PRIMARY KEY,
    match_date      NVARCHAR(MAX),
    city            NVARCHAR(100),
    match_type      NVARCHAR(50),
    result          NVARCHAR(50),
    team1           NVARCHAR(50),
    team2           NVARCHAR(50),
    winning_team    NVARCHAR(50),
    win_type        NVARCHAR(50),
    win_margin      NVARCHAR(50),
    event_name      NVARCHAR(MAX),
    team1_players   NVARCHAR(MAX),   -- presumably a serialized list of player names; confirm against the loader
    team2_players   NVARCHAR(MAX),   -- presumably a serialized list of player names; confirm against the loader
    source_filename NVARCHAR(50),
    load_timestamp  DATETIME DEFAULT (CURRENT_TIMESTAMP)   -- set automatically at insert time
);
39+
GO
40+
41+
-- Create table for innings: one row per innings, linked to its parent match.
-- innings_id is generated; team is mandatory.
CREATE TABLE stg_innings (
    innings_id     INT PRIMARY KEY IDENTITY(1,1),
    match_id       INT,
    innings_number INT,
    team           NVARCHAR(100) NOT NULL,
    load_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,   -- set automatically at insert time
    -- Last element of the definition: no trailing comma (non-standard SQL).
    FOREIGN KEY (match_id) REFERENCES stg_matches(match_id)
);
50+
GO
51+
52+
-- Create table for overs: one row per over, linked to its parent innings.
CREATE TABLE stg_overs (
    over_id        INT IDENTITY(1,1) PRIMARY KEY,
    innings_id     INT FOREIGN KEY REFERENCES stg_innings(innings_id),
    over_number    INT,
    load_timestamp DATETIME DEFAULT (CURRENT_TIMESTAMP)   -- set automatically at insert time
);
60+
GO
61+
62+
-- Create table for deliveries: one row per delivery, linked to its parent over.
-- Player columns hold names as text; wicket_taken defaults to 0.
CREATE TABLE stg_deliveries (
    delivery_id     INT IDENTITY(1,1) PRIMARY KEY,
    over_id         INT FOREIGN KEY REFERENCES stg_overs(over_id),
    batter          NVARCHAR(100),
    bowler          NVARCHAR(100),
    non_striker     NVARCHAR(100),
    delivery_number INT,
    total_runs      INT,
    batter_runs     INT,
    extras          INT,
    wicket_taken    BIT DEFAULT (0),
    load_timestamp  DATETIME DEFAULT (CURRENT_TIMESTAMP)   -- set automatically at insert time
);
76+
GO
77+
78+
-- Create table for wickets: one row per wicket, linked to the delivery it fell on.
CREATE TABLE stg_wickets (
    wicket_id      INT IDENTITY(1,1) PRIMARY KEY,
    delivery_id    INT FOREIGN KEY REFERENCES stg_deliveries(delivery_id),
    player_out     NVARCHAR(100),
    kind           NVARCHAR(100),
    load_timestamp DATETIME DEFAULT (CURRENT_TIMESTAMP)   -- set automatically at insert time
);
86+
GO

SQL/2. create_cricket_db.sql

+163
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
-- Star-schema database bootstrap.
-- Creates CricketDB only when it is not already present, so the script can
-- be re-run without failing on an existing database.
IF DB_ID('CricketDB') IS NULL
BEGIN
    CREATE DATABASE CricketDB;
END;
GO

-- Make CricketDB the context for the rest of the script.
USE CricketDB;
GO
11+
12+
-- Fresh start: remove any star-schema tables left over from a previous run.
-- Order matters: bridge tables first, then the fact table, then dimensions
-- from the most dependent (DimWicket) down to the ones they reference
-- (DimPlayer, DimTeam).
-- NOTE(review): MatchTeamBridge is dropped here but is not created anywhere
-- in this script - possibly a leftover from an earlier design.
IF OBJECT_ID('MatchTeamBridge', 'U') IS NOT NULL
    DROP TABLE MatchTeamBridge;

IF OBJECT_ID('MatchPlayerBridge', 'U') IS NOT NULL
    DROP TABLE MatchPlayerBridge;

IF OBJECT_ID('MatchDateBridge', 'U') IS NOT NULL
    DROP TABLE MatchDateBridge;

IF OBJECT_ID('MatchInningsBridge', 'U') IS NOT NULL
    DROP TABLE MatchInningsBridge;

IF OBJECT_ID('FactMatch', 'U') IS NOT NULL
    DROP TABLE FactMatch;

IF OBJECT_ID('DimDate', 'U') IS NOT NULL
    DROP TABLE DimDate;

IF OBJECT_ID('DimCity', 'U') IS NOT NULL
    DROP TABLE DimCity;

IF OBJECT_ID('DimWicket', 'U') IS NOT NULL
    DROP TABLE DimWicket;

IF OBJECT_ID('DimDeliveries', 'U') IS NOT NULL
    DROP TABLE DimDeliveries;

IF OBJECT_ID('DimOvers', 'U') IS NOT NULL
    DROP TABLE DimOvers;

IF OBJECT_ID('DimInnings', 'U') IS NOT NULL
    DROP TABLE DimInnings;

IF OBJECT_ID('DimPlayer', 'U') IS NOT NULL
    DROP TABLE DimPlayer;

IF OBJECT_ID('DimTeam', 'U') IS NOT NULL
    DROP TABLE DimTeam;
GO
27+
28+
29+
-- DimDate: calendar dimension, one row per date with pre-computed parts
-- (day/month/year/quarter numbers, day and month names, weekend flag).
-- date_id is a generated surrogate key; every attribute is mandatory.
CREATE TABLE DimDate (
    date_id      INT IDENTITY(1,1) PRIMARY KEY,
    [date]       DATE        NOT NULL,
    day_of_week  INT         NOT NULL,
    day_of_month INT         NOT NULL,
    [month]      INT         NOT NULL,
    [year]       INT         NOT NULL,
    [quarter]    INT         NOT NULL,
    day_name     VARCHAR(20) NOT NULL,
    month_name   VARCHAR(20) NOT NULL,
    is_weekend   BIT         NOT NULL
);
42+
GO
43+
44+
-- DimTeam: one row per team, keyed by a generated team_id.
CREATE TABLE DimTeam (
    team_id   INT IDENTITY(1,1) PRIMARY KEY,
    team_name VARCHAR(255) NOT NULL
);
49+
GO
50+
51+
-- DimPlayer: one row per player, keyed by a generated player_id.
-- NOTE(review): player_name is VARCHAR here while the staging player columns
-- are NVARCHAR; names with characters outside the database code page would
-- not survive the load unchanged - confirm this is intended.
CREATE TABLE DimPlayer (
    player_id   INT IDENTITY(1,1) PRIMARY KEY,
    player_name VARCHAR(255) NOT NULL
);
56+
GO
57+
58+
-- Create DimCity: one row per city, keyed by a generated city_id.
-- Referenced by FactMatch.city_id.
CREATE TABLE DimCity (
    city_id INT PRIMARY KEY IDENTITY(1,1),
    city    VARCHAR(255) NOT NULL   -- last column: no trailing comma (non-standard SQL)
);
63+
GO
64+
65+
-- DimInnings: one row per innings.
-- innings_id is not an IDENTITY column, so the load must supply the key.
CREATE TABLE DimInnings (
    innings_id     INT PRIMARY KEY,
    innings_number INT NOT NULL,                                          -- 1st innings, 2nd innings, etc.
    team_id        INT NOT NULL FOREIGN KEY REFERENCES DimTeam(team_id)   -- Batting team
);
72+
GO
73+
74+
-- DimOvers: one row per over, linked to the innings it belongs to.
-- over_id is not an IDENTITY column, so the load must supply the key.
CREATE TABLE DimOvers (
    over_id     INT PRIMARY KEY,
    innings_id  INT NOT NULL FOREIGN KEY REFERENCES DimInnings(innings_id),
    over_number INT NOT NULL
);
81+
GO
82+
83+
-- DimDeliveries: one row per delivery, linked to its over and to the three
-- players involved (bowler, batsman on strike, non-striker), each of which
-- references DimPlayer.
-- delivery_id is not an IDENTITY column, so the load must supply the key.
CREATE TABLE DimDeliveries (
    delivery_id     INT PRIMARY KEY,
    over_id         INT NOT NULL FOREIGN KEY REFERENCES DimOvers(over_id),
    delivery_number INT NOT NULL,
    bowler_id       INT NOT NULL FOREIGN KEY REFERENCES DimPlayer(player_id),
    batsman_id      INT NOT NULL FOREIGN KEY REFERENCES DimPlayer(player_id),
    non_striker_id  INT NOT NULL FOREIGN KEY REFERENCES DimPlayer(player_id),
    total_runs      INT NOT NULL,
    batter_runs     INT NOT NULL,
    extras          INT DEFAULT (0),   -- Byes, leg-byes, no-balls, etc.
    wicket_taken    BIT DEFAULT (0)
);
100+
GO
101+
102+
-- DimWicket: one row per wicket, linked to the delivery it fell on and to
-- the dismissed player.
-- wicket_id is not an IDENTITY column, so the load must supply the key.
CREATE TABLE DimWicket (
    wicket_id      INT PRIMARY KEY,
    delivery_id    INT NOT NULL FOREIGN KEY REFERENCES DimDeliveries(delivery_id),
    batsman_out_id INT NOT NULL FOREIGN KEY REFERENCES DimPlayer(player_id),
    kind           VARCHAR(50)   -- Caught, Bowled, LBW, etc.
);
111+
GO
112+
113+
-- Create FactMatch: one row per match.
-- match_id is not an IDENTITY column, so the load must supply the key.
-- city_id references DimCity; all three team columns reference DimTeam.
-- winning_team_id is nullable (its foreign key is only checked when a value
-- is present).
CREATE TABLE FactMatch (
    match_id        INT PRIMARY KEY,
    city_id         INT NOT NULL,
    match_type      NVARCHAR(50),
    result          NVARCHAR(50),
    team1_id        INT NOT NULL,
    team2_id        INT NOT NULL,
    total_overs     INT,
    total_runs      INT,
    winning_team_id INT,
    win_type        NVARCHAR(50),
    win_margin      INT,
    event_name      NVARCHAR(MAX),
    FOREIGN KEY (city_id)         REFERENCES DimCity(city_id),
    -- team1_id / team2_id get the same integrity check winning_team_id
    -- already had, so a match cannot point at a team missing from DimTeam.
    FOREIGN KEY (team1_id)        REFERENCES DimTeam(team_id),
    FOREIGN KEY (team2_id)        REFERENCES DimTeam(team_id),
    FOREIGN KEY (winning_team_id) REFERENCES DimTeam(team_id)
);
130+
GO
131+
132+
-- MatchPlayerBridge: links a player to a match and to a team for that match.
-- The composite primary key (match_id, player_id, team_id) prevents the same
-- combination from being recorded twice.
CREATE TABLE MatchPlayerBridge (
    player_id INT NOT NULL FOREIGN KEY REFERENCES DimPlayer(player_id),
    match_id  INT NOT NULL FOREIGN KEY REFERENCES FactMatch(match_id),
    team_id   INT NOT NULL FOREIGN KEY REFERENCES DimTeam(team_id),
    PRIMARY KEY (match_id, player_id, team_id)
);
142+
GO
143+
144+
-- MatchDateBridge: links matches to DimDate rows. The composite key allows a
-- match to be associated with more than one date, but each pair only once.
CREATE TABLE MatchDateBridge (
    match_id INT NOT NULL FOREIGN KEY REFERENCES FactMatch(match_id),
    date_id  INT NOT NULL FOREIGN KEY REFERENCES DimDate(date_id),
    PRIMARY KEY (match_id, date_id)
);
152+
GO
153+
154+
-- MatchInningsBridge: links matches to their DimInnings rows. The composite
-- key allows several innings per match, but each pair only once.
CREATE TABLE MatchInningsBridge (
    match_id   INT NOT NULL FOREIGN KEY REFERENCES FactMatch(match_id),
    innings_id INT NOT NULL FOREIGN KEY REFERENCES DimInnings(innings_id),
    PRIMARY KEY (match_id, innings_id)
);
162+
GO
163+

SQL/3. tidy_staging.sql

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
-- Script to tidy up data in staging before loading to star schema.
--
-- Rewrites one damaged player name to the plain-ASCII form 'S Fouche'
-- wherever it occurs in the staging tables. The name appears in two forms:
--   * with the literal escape text \u00c3\u00a9 in the team player lists
--     (backslash is not special in T-SQL strings or LIKE patterns, so this
--     matches those characters literally);
--   * with the characters 'Ã©' in the per-delivery and wicket columns.
-- Both look like a mis-decoded 'é' - assumption, confirm against the source.
--
-- All literals carry the N prefix: the target columns are NVARCHAR, and a
-- non-N literal is first converted through the database code page, which can
-- corrupt the non-ASCII characters and make the match silently fail.
-- Each WHERE clause restricts the UPDATE to rows that contain the name.
--
-- NOTE(review): table names are unqualified, so the script must be run with
-- the staging database as the current database.

UPDATE stg_matches
SET team1_players = REPLACE(team1_players, N'S Fouch\u00c3\u00a9', N'S Fouche')
WHERE team1_players LIKE N'%S Fouch\u00c3\u00a9%';

UPDATE stg_matches
SET team2_players = REPLACE(team2_players, N'S Fouch\u00c3\u00a9', N'S Fouche')
WHERE team2_players LIKE N'%S Fouch\u00c3\u00a9%';

UPDATE stg_deliveries
SET batter = REPLACE(batter, N'S Fouché', N'S Fouche')
WHERE batter LIKE N'%S Fouché%';

UPDATE stg_deliveries
SET bowler = REPLACE(bowler, N'S Fouché', N'S Fouche')
WHERE bowler LIKE N'%S Fouché%';

UPDATE stg_deliveries
SET non_striker = REPLACE(non_striker, N'S Fouché', N'S Fouche')
WHERE non_striker LIKE N'%S Fouché%';

UPDATE stg_wickets
SET player_out = REPLACE(player_out, N'S Fouché', N'S Fouche')
WHERE player_out LIKE N'%S Fouché%';

SQL/4. populate_dim_date.sql

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
-- POPULATE DATE DIMENSION
--
-- Seeds [CricketDB].[dbo].[DimDate] with one row per calendar day in the
-- range @StartDate..@EndDate (inclusive). Dates already present in DimDate
-- are skipped, so the script can be re-run without creating duplicates.

-- DATEPART(WEEKDAY, ...) depends on the session's DATEFIRST setting. Pin it
-- to 7 (Sunday = 1 ... Saturday = 7) so day_of_week and the weekend test
-- below are correct on any server or login. This stays in effect for the
-- rest of the session.
SET DATEFIRST 7;

-- Declare the date range you want to seed
DECLARE @StartDate DATE = '2000-01-01';
DECLARE @EndDate   DATE = '2030-12-31';

-- Recursive CTE: starts at @StartDate and adds one day per recursion step
-- until @EndDate has been produced.
WITH DateSequence AS (
    SELECT @StartDate AS DateValue
    UNION ALL
    SELECT DATEADD(DAY, 1, DateValue)
    FROM DateSequence
    WHERE DateValue < @EndDate
)

-- Insert into DimDate
INSERT INTO [CricketDB].[dbo].[DimDate] (date, day_of_week, day_of_month, month, year, quarter, day_name, month_name, is_weekend)
SELECT
    ds.DateValue                     AS date,          -- Date
    DATEPART(WEEKDAY, ds.DateValue)  AS day_of_week,   -- Day of the week (1=Sunday, 7=Saturday)
    DATEPART(DAY, ds.DateValue)      AS day_of_month,  -- Day of the month (1-31)
    DATEPART(MONTH, ds.DateValue)    AS month,         -- Month (1-12)
    DATEPART(YEAR, ds.DateValue)     AS year,          -- Year
    DATEPART(QUARTER, ds.DateValue)  AS quarter,       -- Quarter (1-4)
    DATENAME(WEEKDAY, ds.DateValue)  AS day_name,      -- Day name (e.g., 'Monday'); follows the session language
    DATENAME(MONTH, ds.DateValue)    AS month_name,    -- Month name (e.g., 'January'); follows the session language
    CASE
        WHEN DATEPART(WEEKDAY, ds.DateValue) IN (1, 7) THEN 1  -- Weekend (1 for weekend, 0 for weekday)
        ELSE 0
    END                              AS is_weekend
FROM DateSequence AS ds
-- Skip dates that are already loaded so a re-run does not duplicate rows.
WHERE NOT EXISTS (
    SELECT 1
    FROM [CricketDB].[dbo].[DimDate] AS existing
    WHERE existing.[date] = ds.DateValue
)
OPTION (MAXRECURSION 0);  -- Remove recursion limit (the default of 100 is far below the number of days needed)

0 commit comments

Comments
 (0)