I-Halder
diff --git a/‎iLLM/evaluate/math_datasets.py
+93 b/‎iLLM/evaluate/math_datasets.py
+93
diff --git a/‎iLLM/generate/MATH.py
+142 b/‎iLLM/generate/MATH.py
+142
diff --git a/‎iLLM/generate/prompts.py
+25 b/‎iLLM/generate/prompts.py
+25
@@ -0,0 +1,93 @@
+from pathlib import Path
+from tqdm import tqdm
+import multiprocessing
+import pydra
+from copy import deepcopy
+import re
+from lm_eval.tasks.minerva_math.utils import (
+    last_boxed_only_string,
+    normalize_final_answer,
+    get_unnormalized_answer,
+    remove_boxed,
+    is_equiv,
+)
+import sys
+sys.path.append('/n/netscratch/pehlevan_lab/Everyone/indranilhalder/language_model_inference/iLLM') # Update path to iLLM
+
+from utils import load_yaml, save_yaml, EvaluateScriptConfig
+
+def filter_ignores(st, regexes_to_ignore):
+    if regexes_to_ignore is not None:
+        for s in regexes_to_ignore:
+            st = re.sub(s, "", st)
+    return st
+
+def is_correct_minerva(og_pred, gt):
+    pred = normalize_final_answer(get_unnormalized_answer(og_pred))
+    gt = normalize_final_answer(remove_boxed(last_boxed_only_string(gt)))
+    return pred == gt or is_equiv(pred, gt)
+
+
+class ScriptConfig(EvaluateScriptConfig):
+    """Evaluation config extended with the name of the dataset being scored."""
+
+    # Dataset key handed to is_correct(); "math" is the only value it accepts.
+    dset: str = "math"
+
+def is_correct(sample: str, gt_answer: str, dset: str):
+    if dset == "math":
+        return is_correct_minerva(sample, gt_answer)
+    else:
+        raise ValueError(f"Dataset {dset} not supported")
+
+
+def process_sample(config: ScriptConfig):
+    if config.save_path.exists():
+        return
+
+    result = load_yaml(config.sample_path)
+    corrects = []
+
+    for sample in result["samples"]:
+        correct = is_correct(sample, result["gt_answer"], config.dset)
+        corrects.append(correct)
+
+    result["is_corrects"] = corrects
+
+    save_yaml(config.save_path, result)
+
+
+def get_tasks(config):
+    sample_paths = Path(config.samples_dir).glob("*.yaml")
+
+    tasks = []
+    for sample_path in tqdm(sample_paths, desc="Loading generations"):
+        save_path = config.save_dir / sample_path.name
+
+        task_config = deepcopy(config)
+        task_config.sample_path = sample_path
+        task_config.save_path = save_path
+
+        tasks.append(task_config)
+
+    return tasks
+
+
+@pydra.main(base=ScriptConfig)
+def main(config: ScriptConfig):
+
+    tasks = get_tasks(config)
+    tasks = sorted(
+        tasks, key=lambda x: x.save_path
+    ) 
+    tasks = tasks[config.offset : config.limit : config.stride]
+
+    print(f"Evaling on {len(tasks)} problems.")
+
+    if config.num_workers not in [0, None]:
+        with multiprocessing.Pool(processes=config.num_workers) as pool:
+            _ = list(tqdm(pool.map(process_sample, tasks), total=len(tasks)))
+    else:
+        for task in tqdm(tasks):
+            process_sample(task)
+
+
+# Run the evaluation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,142 @@
+import torch
+from datasets import load_dataset
+from tqdm import tqdm
+import pydra
+import multiprocessing
+import random
+import requests
+from functools import partial
+import pandas as pd
+
+import sys
+sys.path.append('/n/netscratch/pehlevan_lab/Everyone/indranilhalder/language_model_inference/iLLM') # Update path to iLLM
+
+from generate.prompts import MATH_COT_PROMPT
+from utils import save_yaml, GenerateScriptConfig
+from generate.vllm_utils import vllm_manager
+
+def run_inference(item, config: GenerateScriptConfig):
+    outpath = config.save_dir / f"{item['id']}.yaml"
+    if outpath.exists():
+        return
+
+    prompt = MATH_COT_PROMPT + f"\n\nProblem:\n{item['problem']}\n\nSolution:"
+
+    url = f"http://localhost:{config.vllm_port}/generate"
+
+    num_samples = config.num_samples
+    batch_size = config.batch_size
+
+    assert num_samples % batch_size == 0
+
+    samples = []
+    for _ in tqdm(range(num_samples // batch_size), desc=f"Item {item['id']}"):
+
+        body = {
+            "prompt": prompt,
+            "max_tokens": config.max_tokens,
+            "n": batch_size,
+            "temperature": config.temperature,
+            "top_p": config.top_p,
+            "stop": config.stop_strings,
+            "logprobs": 1,
+        }
+
+        response = requests.post(url, json=body)
+        respj = response.json()
+        samples.extend(respj["text"])
+
+    out = {
+        "level": item["level"],
+        "type": item["type"],
+        "prompt": prompt,
+        "question": item["problem"],
+        "samples": samples,
+        "gt_answer": item["solution"],
+    }
+
+    save_yaml(outpath, out)
+
+
+@pydra.main(GenerateScriptConfig)
+def main(
+    config: GenerateScriptConfig,
+):
+
+    print('Test case with LLM temperature: ', config.temperature)
+    test_dataset = list(
+        load_dataset(
+            "hendrycks/competition_math", "main", split="test", trust_remote_code=True
+        )
+    )
+    df=pd.DataFrame(test_dataset)
+    df =df[df['type'].str.contains('algebra', case=False, na=False)]# Mention problem type
+    algebra_problems =df[df['level'].str.contains('Level 1', case=False, na=False)]# Mention problem level
+    test_dataset=algebra_problems.to_dict('records')
+
+    train_dataset = list(
+        load_dataset(
+            "hendrycks/competition_math", "main", split="train", trust_remote_code=True
+        )
+    )
+
+    print(f"Number of test items: {len(test_dataset)}")
+    print(f"Number of train items: {len(train_dataset)}")
+
+    random.seed(config.seed)
+
+    for i, data in enumerate(train_dataset):
+        data["id"] = i
+
+    for i, data in enumerate(test_dataset):
+        few_shot_items = random.sample(train_dataset, config.num_few_shot)
+        data["id"] = i
+        data["few_shot_items"] = few_shot_items
+    
+    if config.randomize: 
+        random.shuffle(test_dataset)
+    
+
+    shuffled_limit = test_dataset
+
+    if config.limit is not None:
+        limit = config.limit
+    else:
+        limit = len(shuffled_limit)
+
+    if config.stride is not None:
+        stride = config.stride
+    else:
+        stride = 1
+
+    if config.offset is not None:
+        offset = config.offset
+    else:
+        offset = 0
+
+    shuffled_limit = shuffled_limit[offset:limit:stride]
+
+    print(f"Total number of items to process: {len(shuffled_limit)}")
+
+    with vllm_manager(config) as vllm_port:
+        config.vllm_port = vllm_port
+
+        go_func = partial(run_inference, config=config)
+
+        if config.num_workers not in [0, None]:
+            with multiprocessing.Pool(config.num_workers) as pool:
+                predictions = list(
+                    tqdm(
+                        pool.imap_unordered(go_func, test_dataset),
+                        total=len(test_dataset),
+                    )
+                )
+        else:
+            predictions = []
+            for item in tqdm(test_dataset):
+                predictions.append(go_func(item))
+
+
+# Run generation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
+
@@ -0,0 +1,25 @@
+
+# Few-shot chain-of-thought prefix: four worked "Problem:/Solution:" pairs,
+# each solution ending with "Final Answer: The final answer is $...$. I hope
+# it is correct.". The text is sent to the model verbatim, so do not reformat
+# or "fix" it without re-running generation.
+MATH_COT_PROMPT = """Problem:
+Find the domain of the expression  $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}
+
+Solution:
+The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.
+
+Problem:
+If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$
+
+Solution:
+We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.
+
+Problem:
+Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
+
+Solution:
+If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight.  If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight.  Equating this to 480 pounds, we can solve for $n$:\n\\begin{align*}\n30n&=480\\\n\\Rightarrow\\qquad n&=480/30=\\boxed{16}\n\\end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.
+
+Problem:
+If the system of equations\n\n\\begin{align*}\n6x-4y&=a,\\\n6y-9x &=b.\n\\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,\nfind $\\frac{a}{b},$ assuming $b$ is nonzero.
+
+Solution:
+If we multiply the first equation by $-\\frac{3}{2}$, we obtain\n\n$$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have\n\n$$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct."""
+