-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPIFU MI.py
361 lines (270 loc) · 10.3 KB
/
PIFU MI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
# Databricks notebook source
from env import env
from src import utils, excel
import openpyxl
import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime
from openpyxl.styles import NamedStyle
# COMMAND ----------
#Load PIFU data
df_raw_pifu = spark.read.option("header","true").option("recursiveFileLookup","true").parquet(env["pifu_path"])
wb = openpyxl.load_workbook('report_template.xlsx')
report_start = 'August 2021 to '
#display (df_raw_pifu)
publishing_month = df_raw_pifu.select(F.max("EROC_DerMonth")).collect()[0][0]
publishing_month = datetime.strptime(publishing_month, '%Y-%m-%d')
publishing_month = publishing_month.strftime("%B %Y")
date_header = (report_start + publishing_month)
# COMMAND ----------
# MAGIC %md
# MAGIC #specialty
# COMMAND ----------
#creating the specialty sheet data frame
df_tfc_pifu = (df_raw_pifu
.where(F.col("EROC_DerMetricReportingName") == "Moved and Discharged")
.where(F.col("EROC_DerMonth") > "2021-03-01")
.groupBy(
"RTT_Specialty_code",
"RTT_Specialty_Description",
"EROC_DerMonth" )
.agg(F.sum("EROC_Value").alias("Moved_or_Discharged") )
.orderBy("EROC_DerMonth", "RTT_Specialty_code")
.select("EROC_DerMonth", "RTT_Specialty_code", "RTT_Specialty_Description", "Moved_or_Discharged")
.withColumn("RTT_Specialty_Description", F.regexp_replace("RTT_Specialty_Description","–", "-"))
)
display(df_tfc_pifu)
# COMMAND ----------
#pivoting the specialty sheet data frame
df_tfc_pivot = (df_tfc_pifu
.groupBy("RTT_Specialty_code", "RTT_Specialty_Description")
.pivot("EROC_DerMonth")
.agg(F.sum("Moved_or_Discharged") )
.orderBy("RTT_Specialty_code")
.withColumnRenamed("RTT_Specialty_code", "RTT Specialty Code")
.withColumnRenamed("RTT_Specialty_Description", "RTT Specialty Description")
)
display(df_tfc_pivot)
# COMMAND ----------
#creating enland totals
#pivot must come straight after groupby
df_England_pivot = (df_tfc_pifu
.groupBy()
.pivot("EROC_DerMonth")
.agg(F.sum("Moved_or_Discharged") )
)
#inserting into pandas
#only use toPandas on final data frame
df_England_pivot_pd = df_England_pivot.toPandas()
display(df_England_pivot)
# COMMAND ----------
#creating a loop to copy date format across to new and existing data columns
for column in df_tfc_pivot.columns[2:]:
month_format = datetime.strptime(column, '%Y-%m-%d')
month_format = month_format.strftime("%b-%Y")
df_tfc_pivot = df_tfc_pivot.withColumnRenamed(column, month_format)
# COMMAND ----------
#converting the moved and discharged data inot pandas
df_tfc_pivot_pd = df_tfc_pivot.toPandas()
# COMMAND ----------
#inserting pandas dataframe into excel sheet
ws_speciality = wb['PIFU | England & Specialty']
excel.insert_pandas_df_into_excel(
df = df_tfc_pivot_pd,
ws = ws_speciality,
header = True,
startrow = 11,
startcol = 2,
index = False,
)
#instering england totals
excel.insert_pandas_df_into_excel(
df= df_England_pivot_pd,
ws = ws_speciality,
header = False,
startrow = 36,
startcol = 4,
index = False,
)
# COMMAND ----------
#adding date header (in description)
ws_speciality.cell(row=3, column=3).value = date_header
# COMMAND ----------
#defining boundaries of cells being copied
number_of_months = df_tfc_pifu.select(F.countDistinct("EROC_DerMonth")).collect()[0][0]
new_months = number_of_months - 38
pre_date_columns = 3
copy_column = 38 + pre_date_columns
end_column = number_of_months + pre_date_columns + 1
# COMMAND ----------
print(number_of_months)
# COMMAND ----------
#copying styles from template to new data
for column_number in range(copy_column , end_column):
for row_number in range(11, 37):
cell_to_copy_from = ws_speciality.cell(row=row_number, column=copy_column)
cell_to_copy_to = ws_speciality.cell(row=row_number, column=column_number)
excel.copy_all_cell_styles(cell_to_copy_from, cell_to_copy_to)
# COMMAND ----------
# Define the number format style
number_style = NamedStyle(name="number", number_format="0")
# Apply the number format to the specified range
for row in ws_speciality.iter_rows(min_row=12, max_row=36, min_col=pre_date_columns + 1, max_col=end_column):
for cell in row:
cell.number_format = number_style.number_format
# COMMAND ----------
# MAGIC %md
# MAGIC #month and org
# COMMAND ----------
df_processed_pifu = (df_raw_pifu
.where ( F.col("EROC_DerMetricReportingName") == "Moved and Discharged")
.where ( F.col("EROC_DerMonth") > '2021-03-01')
.groupby(
"EROC_DerMonth",
"EROC_DerProviderCode",
"EROC_DerProviderName",
"EROC_DerRegionName",
"EROC_DerRegionCode",
"EROC_DerICBCode",
"EROC_DerICBName",
"EROC_DerProviderAcuteStatus"
)
.agg( F.sum("EROC_Value").alias("Moved_or_Discharged"))
.orderBy(
"EROC_DerMonth",
"EROC_DerProviderCode"
)
)
# --For ICB/Region filters, remove the filter for 'Acute' providers, and include the relevant ICB/Region fields for aggregation.
display (df_processed_pifu)
# COMMAND ----------
#putting the data in pivot table format
df_provider_pivot = (df_processed_pifu
.groupby(
"EROC_DerRegionCode",
"EROC_DerRegionName",
"EROC_DerICBCode",
"EROC_DerICBName",
"EROC_DerProviderCode",
"EROC_DerProviderName",
"EROC_DerProviderAcuteStatus",
)
.pivot("EROC_DerMonth")
.agg(F.sum("Moved_or_Discharged"))
.orderBy(
"EROC_DerRegionName",
"EROC_DerICBCode",
"EROC_DerProviderCode"
)
.withColumnRenamed("EROC_DerRegionCode", "Region Code")
.withColumnRenamed("EROC_DerRegionName", "Region Name")
.withColumnRenamed("EROC_DerICBCode", "ICB Code")
.withColumnRenamed("EROC_DerICBName", "ICB Name")
.withColumnRenamed("EROC_DerProviderCode", "Provider Code")
.withColumnRenamed("EROC_DerProviderName", "Provider Name")
.withColumnRenamed("EROC_DerProviderAcuteStatus", "Acute Status")
)
display(df_provider_pivot)
# COMMAND ----------
#formatting date headers along the pivot
for column in df_provider_pivot.columns[7:]:
month_format = datetime.strptime(column, '%Y-%m-%d')
month_format = month_format.strftime("%b-%Y")
df_provider_pivot = df_provider_pivot.withColumnRenamed(column, month_format)
print(df_provider_pivot.columns)
# COMMAND ----------
#converting the pivot to pandas databframe
df_pd_provider_pivot = df_provider_pivot.toPandas()
# COMMAND ----------
#creating a workbook
ws_provider = wb['PIFU | By Org & Month']
excel.insert_pandas_df_into_excel(
df = df_pd_provider_pivot,
ws = wb['PIFU | By Org & Month'],
header = True,
startrow = 11,
startcol = 2,
index = False,
)
# Check width of table and get column numbers of unfomatted columns
number_of_months = df_processed_pifu.select("EROC_DerMonth").distinct().count()
new_months = number_of_months - 42
pre_date_columns = 8
copy_column = 42 + pre_date_columns
end_column = number_of_months + pre_date_columns + 1
# copy and paste formatting onto unformatted columns
for column_number in range (copy_column, end_column):
for row_number in range(11,154):
cell_to_copy_from = ws_provider.cell(row=row_number, column=copy_column)
cell_to_paste_to = ws_provider.cell(row=row_number, column=column_number)
excel.copy_all_cell_styles(cell_to_copy_from, cell_to_paste_to)
# get height of table and get row numbers of unformatted rows
number_of_providers = df_processed_pifu.select("EROC_DerProviderName").distinct().count()
new_provider = number_of_providers - 142
pre_table_rows = 11
copy_row = pre_table_rows + 142
end_row = copy_row + new_provider + 1
# copy and paste formatting onto unformatted rows
for row_number in range (copy_row, end_row):
for column_number in range(2,end_column):
cell_to_copy_from = ws_provider.cell(row=copy_row, column=column_number)
cell_to_paste_to = ws_provider.cell(row= row_number, column=column_number)
excel.copy_all_cell_styles(cell_to_copy_from, cell_to_paste_to)
# Use the values calculated above to get the cell range of the whole table
conditional_formatting_start_cell = "I11"
conditional_formatting_end_col = openpyxl.utils.cell.get_column_letter(end_column - 1)
conditional_formatting_end_row = end_row - 1
conditional_formatting_end_cell = conditional_formatting_end_col + str(conditional_formatting_end_row)
conditional_formatting_range = f"{conditional_formatting_start_cell}:{conditional_formatting_end_cell}"
print(conditional_formatting_range)
# Copy the existing conditional formatting rule, but make it cover the whole table using the range created above.
for rule_name, rule in ws_provider.conditional_formatting._cf_rules.items():
ws_provider.conditional_formatting.add(conditional_formatting_range, rule[0])
# COMMAND ----------
#updating publishing date header
ws_provider.cell(row=3, column=3).value = date_header
# COMMAND ----------
from openpyxl.styles import NamedStyle
# Define the number format style
number_style = NamedStyle(name="number", number_format="0")
# Apply the number format to the specified range
for row in ws_provider.iter_rows(min_row=12, max_row=end_row, min_col=pre_date_columns + 1, max_col=end_column):
for cell in row:
cell.number_format = number_style.number_format
# COMMAND ----------
# MAGIC %md
# MAGIC #csv files
# MAGIC
# COMMAND ----------
#Month provider CSV
df_month_provider_csv = (df_processed_pifu
.select (
"EROC_DerProviderCode",
"EROC_DerProviderName",
"EROC_DerRegionName",
"EROC_DerRegionCode",
"EROC_DerICBCode",
"EROC_DerICBName",
"EROC_DerProviderAcuteStatus",
"EROC_DerMonth",
"Moved_or_Discharged")
)
df_month_provider_csv_pd = df_month_provider_csv.toPandas()
#specialty csv
df_specialty_csv = (df_tfc_pifu
.select (
"RTT_Specialty_code",
"RTT_Specialty_Description",
"EROC_DerMonth",
"Moved_or_Discharged")
)
df_specialty_csv_pd = df_specialty_csv.toPandas()
#save
df_month_provider_csv_pd.to_csv('outputs/PIFU_MI_Month_Provider.csv', index=False)
df_specialty_csv_pd.to_csv('outputs/PIFU_MI_Specialty.csv', index=False)
# COMMAND ----------
# MAGIC %md
# MAGIC #save report
# COMMAND ----------
# Save the workbook with the DataFrame inserted
wb.save('outputs/PIFU_MI.xlsx')