Create chisquared.R

kdyson · kdyson · commit 0130c64cfc14 · 2021-07-26T18:48:52.000-07:00
adding chi squared multiple questions multiple demographic variable result generation AND post hoc tests
diff --git a/chisquared.R b/chisquared.R
@@ -0,0 +1,258 @@
+# likertTable for all three functions should be formatted with question columns
+# and demographic (or grouping) columns, each row should be a different
+# respondents answer. Cells should be e.g. likert responses (More, etc.) or
+# demographic data (e.g. age groups)
+
+## Results table for the addKruskal and addChiSquared functions should be formulated like this:
+
+#
+# activityGardensResults <-
+#   tibble(
+#     demographicVariable = c(
+#       "Gender",
+#       "Marital Status",
+#       "Age",
+#       "Housing",
+#       "Town Type",
+#       "Governorate",
+#       "Employment",
+#       "Work Location",
+#       "Education",
+#       "Before Income",
+#       "After Income"
+#     ),
+#     afterGardensRelaxing_CHI = rep(0.0, length(demographicVariable)),
+#     afterGardensRelaxing_PVAL = rep(0.0, length(demographicVariable)),
+#     afterGardensRelaxing_CORPVAL = rep(0.0, length(demographicVariable)),
+#     afterGardensExercise_CHI = rep(0.0, length(demographicVariable)),
+#     afterGardensExercise_PVAL = rep(0.0, length(demographicVariable)),
+#     afterGardensExercise_CORPVAL = rep(0.0, length(demographicVariable)),
+#     afterGardensBirdPhotography_CHI = rep(0.0, length(demographicVariable)),
+#     afterGardensBirdPhotography_PVAL = rep(0.0, length(demographicVariable)),
+#     afterGardensBirdPhotography_CORPVAL = rep(0.0, length(demographicVariable))
+#
+#   )
+
+
+
+addKruskal <-
+  function(likertTable,
+           resultsTable,
+           questionColumns,
+           demographicColumns) {
+    i = min(demographicColumns) # row
+    j = min(questionColumns) # column
+    
+    a = 1 # the first row in the resultsTable that should have results assigned.
+    c = 4 # first column that should have corrected pvalues
+    
+    # Iterate through each demographic variable
+    for (i in min(demographicColumns):max(demographicColumns)) {
+      b = 2 # the first column in the resultsTable that should have results assigned to it
+      
+      for (j in min(questionColumns):max(questionColumns)) {
+        # Create a temp table and filter out blanks.
+        tempTable <- likertTable[, c(j, i)]
+        tempTable <- tempTable[tempTable[2] != "",]
+        
+        # perform the kruskal wallis test
+        tempKruskal <- kruskal.test(tempTable[, 1] ~ tempTable[, 2])
+        
+        # add statistic value and pvalue to the table.
+        resultsTable[a, b] <- tempKruskal$statistic
+        resultsTable[a, b + 1] <- tempKruskal$p.value
+        
+        # add 2 so next results go in the proper place
+        b = b + 3
+      } # end question loop
+      
+      a = a + 1 # go to next row for next set of demographic info
+      
+    } # end demographic loop
+    
+    # add corrected pvalue
+    
+    z = 1
+    for (z in 1:length(questionColumns)) {
+      tempCorrected <-
+        p.adjust(as.vector(unlist(resultsTable[, c - 1])), method = "holm")
+      
+      resultsTable[, c] <- tempCorrected
+      
+      c = c + 3
+      
+    } # end addition of corrected pvalues
+    
+    return(resultsTable)
+    
+  } # end function
+
+
+
+addChiSquared <-
+  function(likertTable,
+           resultsTable,
+           questionColumns,
+           demographicColumns,
+           minVal = 25) {
+    i = min(demographicColumns) # row
+    j = min(questionColumns) # column
+    
+    a = 1 # the first row in the resultsTable that should have results assigned.
+    c = 4 # first column that should have corrected pvalues
+    
+    # Iterate through each demographic variable
+    for (i in min(demographicColumns):max(demographicColumns)) {
+      b = 2 # the first column in the resultsTable that should have results assigned to it
+      
+      for (j in min(questionColumns):max(questionColumns)) {
+        # Create a temp table and filter out blanks.
+        tempTable <- likertTable[, c(j, i)]
+        tempTable <-
+          tempTable[tempTable[2] != "" & !is.na(tempTable[1]) ,]
+        
+        # Check to see if any group has less than min responses.
+        testSize <-
+          tempTable %>% count(tempTable[1:2]) %>% group_by(across(.cols = 2)) %>% summarise(n = sum(n))
+        
+        tooSmall <- testSize[testSize[2] < minVal, 1]
+        print(unlist(tooSmall))
+        # print(colnames(tempTable)) # use to debug
+        
+        # Based on this, either assign a "can't be tested"/NA indicator or the values for the chi-squared test.
+        if (length(setdiff(unique(tempTable[, 2]), tooSmall)) < 2) {
+          resultsTable[a, b] <- NA
+          resultsTable[a, b + 1] <- NA
+          
+        } else {
+          tempTable <- tempTable[!(tempTable[, 2] %in% tooSmall),]
+          
+          # perform the chi squared test
+          tempChi <- chisq.test(tempTable[, 1], tempTable[, 2])
+          
+          # add statistic value and pvalue to the table.
+          resultsTable[a, b] <- tempChi$statistic
+          resultsTable[a, b + 1] <- tempChi$p.value
+          
+        } # end of else
+        
+        # Use this if you want to export a Pivot table
+        # pivot <- pivot_wider(testSize,
+        #                         names_from = colnames(testSize[1]),
+        #                         values_from = n)
+        
+        # Clean up
+        remove(testSize, tooSmall)
+        
+        # add 2 so next results go in the proper place
+        b = b + 3
+      } # end question loop
+      
+      a = a + 1 # go to next row for next set of demographic info
+      
+    } # end demographic loop
+    
+    # add corrected pvalue
+    
+    z = 1
+    for (z in 1:length(questionColumns)) {
+      tempCorrected <-
+        p.adjust(as.vector(unlist(resultsTable[, c - 1])), method = "holm")
+      
+      resultsTable[, c] <- tempCorrected
+      
+      c = c + 3
+      
+    } # end addition of corrected pvalues
+    
+    return(resultsTable)
+    
+  } # end function
+
+
+
+# This function needs to compare between categories within a question to see
+# which categories are significantly different.
+
+posthocChiSquared <-
+  function(likertTable,
+           questionColumn,
+           demographicColumn,
+           minVal = 25,
+           correction = TRUE) {
+    tempTable <- likertTable[, c(questionColumn, demographicColumn)]
+    tempTable <-
+      tempTable[tempTable[2] != "" & !is.na(tempTable[1]) , ]
+    
+    # Check to see if any group has less than min allowed responses.
+    testSize <-
+      tempTable %>% count(tempTable[1:2]) %>% group_by(across(.cols = 2)) %>% summarise(n = sum(n))
+    
+    tooSmall <- testSize[testSize[2] < minVal, 1]
+    print(unlist(tooSmall))
+    
+    # Filter out too small categories
+    tempTable <- tempTable[!(tempTable[, 2] %in% tooSmall), ]
+    # and make sure demographic is a factor (this gets lost sometimes)
+    if (is.factor(tempTable[, 2]) == FALSE) {
+      tempTable[, 2] <- as.factor(tempTable[, 2])
+    }
+    
+    # Make sure that the number of unique is more than two
+    if (nlevels(tempTable[, 2]) < 3) {
+      stop(print("Too few levels"))
+    }
+    
+    numLevels <- nlevels(tempTable[, 2])
+    
+    # create the results table
+    #create matrix with correct number of columns
+    resultsTable <- matrix(rep(999, times = numLevels ^ 2),
+                           ncol = numLevels,
+                           byrow = TRUE)
+    
+    #define column names and row names of matrix
+    tempLevels <- levels(tempTable[, 2])
+    colnames(resultsTable) <- tempLevels
+    rownames(resultsTable) <- tempLevels
+    
+    # for each [i,j] pair of factors add the pvalue for chisquared test
+    i = 1 # row
+    j = 1 # column
+    for (i in 1:numLevels) {
+      for (j in 1:numLevels) {
+        if (i != j) {
+          # subset for i and j levels
+          testTable <-
+            tempTable[tempTable[, 2] %in% tempLevels[c(i, j)] ,]
+          
+          # run test and assign pvalue to i,j spot
+          resultsTable[i, j] <-
+            chisq.test(testTable[, 1], testTable[, 2])$p.value
+          
+        } else {
+          resultsTable[i, j] <- NA
+        }
+        
+      } # end column loop
+      
+    } # end row loop
+    
+    # remove the lower triangle--can probably make this a filter but this is easy
+    resultsTable[lower.tri(resultsTable, diag = FALSE)] <- NA
+    
+    # correct pvalues
+    if (correction == TRUE) {
+      resultsTable <-
+        matrix(
+          p.adjust(as.vector(resultsTable), method = 'holm'),
+          ncol = numLevels,
+          dimnames = list(tempLevels, tempLevels)
+        )
+    }
+    #convert matrix to a tibble
+    resultsTable <- as_tibble(resultsTable, rownames = "levels")
+    
+    return(resultsTable)
+    
+  } # end function