-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathParseResults.py
63 lines (53 loc) · 2.4 KB
/
ParseResults.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import os
import json
class ParseResults:
    """Collect API result files and compare them with the ground-truth CSV.

    The class reads every ``.jsonl`` file found in ``results_path``, builds a
    single dataframe out of the decoded records, and can then report which
    ids of the base dataframe are absent from the results so that they can be
    requested again.
    """

    # Column name historically used for ids; kept as a fallback when the
    # configured ``text_id_field`` is not present in a dataframe.
    _LEGACY_ID_FIELD = "comment_id"

    def __init__(self, base_dataframe_path, text_field, text_id_field, results_path="results/"):
        """
        base_dataframe_path: path to the CSV with the ground truth dataframe,
            from which the initial requests were generated
        text_field: name of the text column (stored, not used by this class)
        text_id_field: name of the id column of the base dataframe
        results_path: path to folder with the jsonl files
        """
        self.results_path = results_path
        self.base_dataframe_path = base_dataframe_path
        self.text_field = text_field
        self.text_id_field = text_id_field
        # Filled in by concat_results(); None means "not loaded yet".
        self.perspective_df = None
        self.base_dataframe = pd.read_csv(self.base_dataframe_path, lineterminator="\n")
        print("Length of base dataframe:", len(self.base_dataframe))

    def _id_column(self, frame):
        """Return the id column to use for ``frame``.

        Prefers the configured ``text_id_field`` and falls back to the legacy
        ``"comment_id"`` name. Raises KeyError when neither column exists.
        """
        for candidate in (self.text_id_field, self._LEGACY_ID_FIELD):
            if candidate in frame.columns:
                return candidate
        raise KeyError(
            f"id column {self.text_id_field!r} (or {self._LEGACY_ID_FIELD!r}) "
            "not found in dataframe"
        )

    def concat_results(self, save_df=True):
        """
        Concatenates all results into a single dataframe, stored in
        self.perspective_df. The files are decoded line by line with
        json.loads because pd.read_json(, lines=True) returned errors.

        save_df: whether or not to save the dataframe as "perspective_df.csv"
            in the current folder
        """
        # Sorted so that the row order does not depend on the filesystem.
        file_names = sorted(
            name for name in os.listdir(self.results_path) if name.endswith(".jsonl")
        )
        records = []
        for name in file_names:
            # os.path.join works whether or not results_path ends with a separator.
            path = os.path.join(self.results_path, name)
            with open(path, "r") as handle:
                for line in handle:
                    # Blank lines (e.g. a trailing newline) carry no record.
                    if not line.strip():
                        continue
                    records.append(json.loads(line))
        self.perspective_df = pd.DataFrame.from_records(records)
        if save_df:
            self.perspective_df.to_csv("perspective_df.csv", index=False)
        print("Number of corretly processed entries:", len(self.perspective_df))

    def find_missing_ids(self):
        """
        Finds which ids need to be requested again.

        Returns the set of ids present in the base dataframe but absent from
        the concatenated results.

        Raises RuntimeError if concat_results has not been run yet, and
        KeyError if no id column can be found in one of the dataframes.
        """
        results = getattr(self, "perspective_df", None)
        if results is None:
            raise RuntimeError(
                "perspective_df has not been initialized yet! run concat_results beforehand"
            )
        if results.empty:
            # No record was downloaded at all (the frame may even have no
            # columns), so every base id is missing.
            downloaded = set()
        else:
            downloaded = set(results[self._id_column(results)].tolist())
        base_column = self._id_column(self.base_dataframe)
        base_ids = set(self.base_dataframe[base_column].tolist())
        return base_ids - downloaded

    def generate_retry_dataframe(self, missing_ids):
        """
        Given a collection of ids that were not correctly computed, returns the
        rows of the base dataframe with those ids, to be submitted to the API
        again.
        """
        is_missing = self.base_dataframe[self.text_id_field].isin(missing_ids)
        return self.base_dataframe[is_missing]