-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathA.2 run_annotate_document.R
61 lines (41 loc) · 1.53 KB
/
A.2 run_annotate_document.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Session setup and corpus load.
# NOTE(review): rm(list = ls()) wipes the caller's global environment when this
# script is source()d interactively; prefer running it in a fresh R session.
rm(list = ls())
#renv::load()
library("tidyverse")
library("udpipe")
library("here")
library("stm")
library("data.table")
library("tidytext")
# Getting data
# data <- read.csv("data/final_data.csv")
# Read the filtered corpus. The encoding name must be spelled "UTF-8":
# read.csv() only uses this argument to mark strings as Latin-1 or UTF-8, so
# the previous "UFT-8" typo was not recognised as a known encoding.
data <- read.csv("data/final_data_filt.csv", encoding = "UTF-8")
####################
# Annotate the data using udpipe in parallel and save it
####################
# Download the English (EWT) udpipe model. With overwrite = FALSE the model is
# only downloaded when it is not already present on disk.
ud_model <- udpipe_download_model(language = "english-ewt", overwrite = FALSE)
# Fail early with the downloader's own message rather than with a less obvious
# error when the model file is loaded later on.
if (isTRUE(ud_model$download_failed)) {
  stop(
    "udpipe model download failed: ", ud_model$download_message,
    call. = FALSE
  )
}
# Load the model in this session. The parallel annotation step does not use
# this object: each worker call loads the model again from ud_model$file_model.
ud_en <- udpipe_load_model(ud_model)
# Annotate one chunk of the corpus with udpipe.
#   x    - data frame chunk; column b_abstract supplies the text and column ID
#          supplies the document ids
#   file - path to the udpipe model file; the model is loaded inside the call
# Returns the annotation converted to a data.table.
annotate_splits <- function(x, file) {
  model <- udpipe_load_model(file)
  annotated <- udpipe_annotate(model, x = x$b_abstract, doc_id = x$ID)
  as.data.table(annotated)
}
#
# # load parallel library future.apply
library(future.apply)
# Number of background R sessions used for the annotation
ncores <- 12L
plan(multisession, workers = ncores)
# Split the corpus into consecutive chunks of 200 rows. The chunk id is derived
# from the row position rather than from parsed row names, so it also works
# when the row names are not the default 1..n.
chunk_size <- 200L
corpus_splitted <- split(data, (seq_len(nrow(data)) - 1L) %/% chunk_size)
# Annotate every chunk on the workers, then stack the per-chunk results
annotation <- future_lapply(
  corpus_splitted,
  annotate_splits,
  file = ud_model$file_model
)
annotation <- rbindlist(annotation)
# Switch back to sequential processing so the background workers are shut down
# once the parallel part is finished.
plan(sequential)
# Report the number of missing values in each annotation column
print(vapply(annotation, function(x) sum(is.na(x)), integer(1)))
annofinance <- annotation
# Number of upper-case ASCII letters in each lemma (NA where the lemma is NA)
lowering <- str_count(annofinance$lemma, "[A-Z]")
# Lower-case lemmas that contain at most one capital letter; lemmas with two or
# more capitals are left untouched (presumably acronyms - confirm).
# which() drops the NA counts: an NA in a logical subscript makes a multi-value
# assignment fail with "NAs are not allowed in subscripted assignments".
to_lower <- which(lowering <= 1)
annofinance$lemma[to_lower] <- stringr::str_to_lower(annofinance$lemma[to_lower])
saveRDS(annofinance, file = "data/annofinance.rds")