-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathetl_regression_analysis.R
50 lines (36 loc) · 1.46 KB
/
etl_regression_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
library(readxl)
library(ggplot2)
#' Plot linear regressions for strongly correlated numeric column pairs
#'
#' Reads a CSV or Excel file, keeps its numeric columns and, for every pair
#' of columns whose correlation (as computed by `cor()`) is greater than 0.7,
#' saves a scatter plot with a fitted regression line to
#' `regression_plot_<a>_vs_<b>.png`. The output file name is relative, so the
#' plots land in the current working directory.
#'
#' Rows where either column of a pair is missing are dropped for that pair
#' before the correlation is computed and the plot is drawn.
#'
#' @param file_path Path to a `.csv`, `.xlsx` or `.xls` file. The extension
#'   is matched case-insensitively.
#' @return Invisibly, a character vector with the names of the plot files
#'   that were written (empty when no pair qualified).
perform_etl <- function(file_path) {
  if (!file.exists(file_path)) {
    stop("File not found. Please provide a valid file path.", call. = FALSE)
  }

  # Lower-case so that "DATA.CSV" or "report.XLSX" are accepted as well.
  file_extension <- tolower(tools::file_ext(file_path))
  if (!file_extension %in% c("csv", "xlsx", "xls")) {
    stop(
      "Unsupported file type. Please provide a CSV or Excel file.",
      call. = FALSE
    )
  }

  data <- switch(file_extension,
    csv = read.csv(file_path),
    xlsx = ,
    xls = read_excel(file_path, sheet = 1)
  )

  numeric_data <- Filter(is.numeric, data)

  # combn() raises "n < m" when asked for pairs from fewer than two names.
  if (length(numeric_data) < 2L) {
    message("Fewer than two numeric columns found; no pairs to analyse.")
    return(invisible(character(0)))
  }

  column_pairs <- combn(names(numeric_data), 2, simplify = FALSE)
  written <- rep(NA_character_, length(column_pairs))

  for (i in seq_along(column_pairs)) {
    pair <- column_pairs[[i]]
    x <- numeric_data[[pair[1]]]
    y <- numeric_data[[pair[2]]]

    # A single NA makes cor() return NA, which would abort the `if` below,
    # so restrict each pair to the rows where both values are present.
    keep <- complete.cases(x, y)
    x <- x[keep]
    y <- y[keep]
    if (length(x) < 2L) {
      next
    }

    correlation <- cor(x, y)
    # cor() still yields NA for a constant column (zero variance); skip it.
    if (is.na(correlation) || correlation <= 0.7) {
      next
    }

    plot_data <- data.frame(x = x, y = y)
    plot <- ggplot(plot_data, aes(x = x, y = y)) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE, color = "blue") +
      labs(
        title = paste("Linear Regression:", pair[1], "vs", pair[2]),
        subtitle = paste("Correlation:", round(correlation, 2))
      )

    output_file <- paste0(
      "regression_plot_", pair[1], "_vs_", pair[2], ".png"
    )
    ggsave(output_file, plot)
    cat("Generated plot:", output_file, "\n")
    written[i] <- output_file
  }

  invisible(written[!is.na(written)])
}
# Script entry point: run the analysis on the configured input file.
# NOTE(review): the path below looks like a placeholder; point it at a real
# CSV/Excel file before running, otherwise perform_etl() stops with
# "File not found".
file_path <- "path/to/file.xlsx"
perform_etl(file_path = file_path)