Skip to content

Commit 6199c0a

Browse files
authored
Add files via upload
1 parent e212e5f commit 6199c0a

File tree

6 files changed

+640
-0
lines changed

6 files changed

+640
-0
lines changed

iLLM/evaluate/math_datasets.py

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
from pathlib import Path
2+
from tqdm import tqdm
3+
import multiprocessing
4+
import pydra
5+
from copy import deepcopy
6+
import re
7+
from lm_eval.tasks.minerva_math.utils import (
8+
last_boxed_only_string,
9+
normalize_final_answer,
10+
get_unnormalized_answer,
11+
remove_boxed,
12+
is_equiv,
13+
)
14+
import sys
15+
sys.path.append('/n/netscratch/pehlevan_lab/Everyone/indranilhalder/language_model_inference/iLLM') # Update path to iLLM
16+
17+
from utils import load_yaml, save_yaml, EvaluateScriptConfig
18+
19+
def filter_ignores(st, regexes_to_ignore):
    """Remove every match of the given regex patterns from ``st``.

    Args:
        st: Text to clean.
        regexes_to_ignore: Iterable of regex patterns, applied in order, or
            ``None`` to return ``st`` unchanged.

    Returns:
        ``st`` with each pattern's matches replaced by the empty string.
    """
    if regexes_to_ignore is None:
        return st

    cleaned = st
    for pattern in regexes_to_ignore:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned
24+
25+
def is_correct_minerva(og_pred, gt):
    """Grade one model output against a reference solution.

    Both sides are passed through ``normalize_final_answer``: the prediction
    after ``get_unnormalized_answer``, the reference after
    ``remove_boxed(last_boxed_only_string(gt))``.  The helpers come from
    lm_eval's minerva_math utilities.

    Args:
        og_pred: Raw model output text.
        gt: Reference solution text (presumably containing a ``\\boxed{}``
            answer -- confirm against the dataset).

    Returns:
        ``True`` when the normalized strings are equal; otherwise whatever
        ``is_equiv`` returns for the normalized pair.
    """
    predicted = normalize_final_answer(get_unnormalized_answer(og_pred))
    boxed_reference = last_boxed_only_string(gt)
    reference = normalize_final_answer(remove_boxed(boxed_reference))

    # Cheap exact match first; only fall back to is_equiv when it fails.
    if predicted == reference:
        return True
    return is_equiv(predicted, reference)
29+
30+
31+
class ScriptConfig(EvaluateScriptConfig):
    # Evaluation config: everything from EvaluateScriptConfig plus a dataset selector.
    # `dset` picks the grading rule used by is_correct(); "math" is the only
    # value is_correct() accepts in this file.
    dset: str = "math"
33+
34+
def is_correct(sample: str, gt_answer: str, dset: str):
    """Grade ``sample`` against ``gt_answer`` using the rule for ``dset``.

    Args:
        sample: One generated solution.
        gt_answer: Reference solution for the same problem.
        dset: Dataset name; only ``"math"`` is supported.

    Returns:
        The result of ``is_correct_minerva(sample, gt_answer)``.

    Raises:
        ValueError: If ``dset`` is anything other than ``"math"``.
    """
    if dset != "math":
        raise ValueError(f"Dataset {dset} not supported")
    return is_correct_minerva(sample, gt_answer)
39+
40+
41+
def process_sample(config: ScriptConfig):
    """Grade all samples of one generation file and write the graded copy.

    Does nothing when ``config.save_path`` already exists.  Otherwise loads
    ``config.sample_path``, grades each entry of ``result["samples"]`` against
    ``result["gt_answer"]``, stores the verdicts under ``"is_corrects"`` and
    saves the whole record to ``config.save_path``.

    Args:
        config: Per-file task config; reads ``sample_path``, ``save_path``
            and ``dset``.
    """
    if config.save_path.exists():
        return

    result = load_yaml(config.sample_path)
    result["is_corrects"] = [
        is_correct(sample, result["gt_answer"], config.dset)
        for sample in result["samples"]
    ]
    save_yaml(config.save_path, result)
55+
56+
57+
def get_tasks(config):
    """Create one task config per ``*.yaml`` file in ``config.samples_dir``.

    Each task is a deep copy of ``config`` with two extra attributes:
    ``sample_path`` (the generation file) and ``save_path`` (a file of the
    same name under ``config.save_dir``).

    Args:
        config: Script config; reads ``samples_dir`` and ``save_dir``.

    Returns:
        List of task configs, in the order ``glob`` yields the files.
    """
    samples_dir = Path(config.samples_dir)
    # Coerce save_dir the same way as samples_dir so a plain-string value
    # does not fail on the `/` join below.
    save_dir = Path(config.save_dir)

    tasks = []
    for sample_path in tqdm(samples_dir.glob("*.yaml"), desc="Loading generations"):
        task_config = deepcopy(config)
        task_config.sample_path = sample_path
        task_config.save_path = save_dir / sample_path.name
        tasks.append(task_config)

    return tasks
71+
72+
73+
@pydra.main(base=ScriptConfig)
def main(config: ScriptConfig):
    """Grade every generation file selected by the config.

    Tasks from ``get_tasks`` are sorted by ``save_path``, sliced with
    ``config.offset : config.limit : config.stride``, and each one is handed
    to ``process_sample`` -- in a process pool when ``config.num_workers`` is
    neither ``0`` nor ``None``, otherwise sequentially.

    Args:
        config: Script config; reads ``offset``, ``limit``, ``stride`` and
            ``num_workers`` (plus whatever ``get_tasks`` reads).
    """
    tasks = sorted(get_tasks(config), key=lambda task: task.save_path)
    tasks = tasks[config.offset : config.limit : config.stride]

    print(f"Evaling on {len(tasks)} problems.")

    if config.num_workers not in [0, None]:
        with multiprocessing.Pool(processes=config.num_workers) as pool:
            # imap_unordered yields as each task finishes, so the bar shows
            # live progress; pool.map would only return once everything is done.
            # process_sample returns nothing, so result order is irrelevant.
            for _ in tqdm(
                pool.imap_unordered(process_sample, tasks), total=len(tasks)
            ):
                pass
    else:
        for task in tqdm(tasks):
            process_sample(task)
90+
91+
92+
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()

iLLM/generate/MATH.py

+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import torch
2+
from datasets import load_dataset
3+
from tqdm import tqdm
4+
import pydra
5+
import multiprocessing
6+
import random
7+
import requests
8+
from functools import partial
9+
import pandas as pd
10+
11+
import sys
12+
sys.path.append('/n/netscratch/pehlevan_lab/Everyone/indranilhalder/language_model_inference/iLLM') # Update path to iLLM
13+
14+
from generate.prompts import MATH_COT_PROMPT
15+
from utils import save_yaml, GenerateScriptConfig
16+
from generate.vllm_utils import vllm_manager
17+
18+
def run_inference(item, config: GenerateScriptConfig):
    """Sample completions for one problem and save them to a YAML file.

    Does nothing when ``config.save_dir / "<id>.yaml"`` already exists.
    Otherwise the prompt is ``MATH_COT_PROMPT`` followed by the item's
    problem, and ``num_samples // batch_size`` POST requests (each asking for
    ``batch_size`` completions) are sent to
    ``http://localhost:{config.vllm_port}/generate``.  The ``"text"`` entries
    of every response are collected and written out together with the
    prompt and the item's metadata.

    Args:
        item: Mapping with keys ``id``, ``problem``, ``level``, ``type`` and
            ``solution``.
        config: Reads ``save_dir``, ``vllm_port``, ``num_samples``,
            ``batch_size``, ``max_tokens``, ``temperature``, ``top_p`` and
            ``stop_strings``.

    Raises:
        ValueError: If ``num_samples`` is not a multiple of ``batch_size``.
        requests.HTTPError: If the server answers with an error status.
    """
    outpath = config.save_dir / f"{item['id']}.yaml"
    if outpath.exists():
        return

    prompt = MATH_COT_PROMPT + f"\n\nProblem:\n{item['problem']}\n\nSolution:"

    url = f"http://localhost:{config.vllm_port}/generate"

    num_samples = config.num_samples
    batch_size = config.batch_size

    # Explicit check instead of `assert`: asserts vanish under `python -O`,
    # and the floor division below would then silently drop samples.
    if num_samples % batch_size != 0:
        raise ValueError(
            f"num_samples ({num_samples}) must be a multiple of "
            f"batch_size ({batch_size})"
        )

    # The request body is identical for every batch, so build it once.
    body = {
        "prompt": prompt,
        "max_tokens": config.max_tokens,
        "n": batch_size,
        "temperature": config.temperature,
        "top_p": config.top_p,
        "stop": config.stop_strings,
        "logprobs": 1,
    }

    samples = []
    for _ in tqdm(range(num_samples // batch_size), desc=f"Item {item['id']}"):
        response = requests.post(url, json=body)
        # Surface server errors here rather than as a confusing KeyError or
        # JSON decode error on the lines below.
        response.raise_for_status()
        respj = response.json()
        samples.extend(respj["text"])

    out = {
        "level": item["level"],
        "type": item["type"],
        "prompt": prompt,
        "question": item["problem"],
        "samples": samples,
        "gt_answer": item["solution"],
    }

    save_yaml(outpath, out)
59+
60+
61+
@pydra.main(GenerateScriptConfig)
def main(
    config: GenerateScriptConfig,
):
    """Generate samples for a filtered subset of the MATH test split.

    Loads the ``hendrycks/competition_math`` test and train splits, keeps
    test rows whose ``type`` contains "algebra" and whose ``level`` contains
    "Level 1", assigns ids, optionally shuffles, applies
    ``offset``/``limit``/``stride``, and runs ``run_inference`` on every
    selected item while a vLLM server from ``vllm_manager`` is up.

    Args:
        config: Reads ``temperature``, ``seed``, ``num_few_shot``,
            ``randomize``, ``limit``, ``stride``, ``offset`` and
            ``num_workers``; ``vllm_port`` is set on it before inference.
    """
    print('Test case with LLM temperature: ', config.temperature)
    test_dataset = list(
        load_dataset(
            "hendrycks/competition_math", "main", split="test", trust_remote_code=True
        )
    )
    df = pd.DataFrame(test_dataset)
    # Problem-type filter.
    # NOTE(review): a case-insensitive substring match on 'algebra' also keeps
    # any type whose name merely contains it (e.g. "Prealgebra") -- confirm
    # this is intended.
    df = df[df['type'].str.contains('algebra', case=False, na=False)]
    # Problem-level filter.
    algebra_problems = df[df['level'].str.contains('Level 1', case=False, na=False)]
    test_dataset = algebra_problems.to_dict('records')

    train_dataset = list(
        load_dataset(
            "hendrycks/competition_math", "main", split="train", trust_remote_code=True
        )
    )

    print(f"Number of test items: {len(test_dataset)}")
    print(f"Number of train items: {len(train_dataset)}")

    random.seed(config.seed)

    for i, data in enumerate(train_dataset):
        data["id"] = i

    for i, data in enumerate(test_dataset):
        # Sampled per item to keep the seeded RNG sequence (and therefore the
        # shuffle below) unchanged; run_inference does not read this field.
        few_shot_items = random.sample(train_dataset, config.num_few_shot)
        data["id"] = i
        data["few_shot_items"] = few_shot_items

    if config.randomize:
        random.shuffle(test_dataset)

    limit = config.limit if config.limit is not None else len(test_dataset)
    stride = config.stride if config.stride is not None else 1
    offset = config.offset if config.offset is not None else 0

    shuffled_limit = test_dataset[offset:limit:stride]

    print(f"Total number of items to process: {len(shuffled_limit)}")

    with vllm_manager(config) as vllm_port:
        config.vllm_port = vllm_port

        go_func = partial(run_inference, config=config)

        # Iterate the sliced selection, not the full test_dataset, so that
        # offset/limit/stride take effect and the count printed above is
        # the count actually processed.
        if config.num_workers not in [0, None]:
            with multiprocessing.Pool(config.num_workers) as pool:
                for _ in tqdm(
                    pool.imap_unordered(go_func, shuffled_limit),
                    total=len(shuffled_limit),
                ):
                    pass
        else:
            for item in tqdm(shuffled_limit):
                go_func(item)
138+
139+
140+
# Run generation only when this file is executed as a script.
if __name__ == "__main__":
    main()
142+

iLLM/generate/prompts.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
MATH_COT_PROMPT = """Problem:
3+
Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}
4+
5+
Solution:
6+
The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.
7+
8+
Problem:
9+
If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$
10+
11+
Solution:
12+
We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.
13+
14+
Problem:
15+
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
16+
17+
Solution:
18+
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:\n\\begin{align*}\n30n&=480\\\n\\Rightarrow\\qquad n&=480/30=\\boxed{16}\n\\end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.
19+
20+
Problem:
21+
If the system of equations\n\n\\begin{align*}\n6x-4y&=a,\\\n6y-9x &=b.\n\\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,\nfind $\\frac{a}{b},$ assuming $b$ is nonzero.
22+
23+
Solution:
24+
If we multiply the first equation by $-\\frac{3}{2}$, we obtain\n\n$$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have\n\n$$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct."""
25+

0 commit comments

Comments
 (0)