
Commit 4b232ed

Fix example int8_inference_huggingface.py (#414)
* Fix example int8_inference_huggingface.py
* Update examples/int8_inference_huggingface.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
1 parent cc5f8cd commit 4b232ed

File tree: examples/int8_inference_huggingface.py

1 file changed (+5 −5 lines)
examples/int8_inference_huggingface.py

@@ -1,24 +1,24 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import LlamaForCausalLM, LlamaTokenizer

 MAX_NEW_TOKENS = 128
-model_name = 'decapoda-research/llama-7b-hf'
+model_name = 'meta-llama/Llama-2-7b-hf'

 text = 'Hamburg is in which country?\n'
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer = LlamaTokenizer.from_pretrained(model_name)
 input_ids = tokenizer(text, return_tensors="pt").input_ids

-free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
 max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'

 n_gpus = torch.cuda.device_count()
 max_memory = {i: max_memory for i in range(n_gpus)}

-model = AutoModelForCausalLM.from_pretrained(
+model = LlamaForCausalLM.from_pretrained(
   model_name,
   device_map='auto',
   load_in_8bit=True,
   max_memory=max_memory
 )
+
 generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
 print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
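As a side note, not part of this commit: on more recent transformers releases the 8-bit path is usually requested through a BitsAndBytesConfig object rather than the bare load_in_8bit flag. Below is a minimal sketch of that variant, assuming a transformers version that ships BitsAndBytesConfig, a CUDA GPU with bitsandbytes installed, and access to the gated meta-llama/Llama-2-7b-hf checkpoint (it requires accepting the license on the Hugging Face Hub).

import torch
from transformers import BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer

MAX_NEW_TOKENS = 128
model_name = 'meta-llama/Llama-2-7b-hf'  # gated checkpoint: accept the license on the Hub first

text = 'Hamburg is in which country?\n'
tokenizer = LlamaTokenizer.from_pretrained(model_name)
input_ids = tokenizer(text, return_tensors="pt").input_ids

# Leave roughly 2GB of headroom per GPU for activations and the KV cache.
max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'
max_memory = {i: max_memory for i in range(torch.cuda.device_count())}

# 8-bit (LLM.int8()) weight quantization via bitsandbytes, expressed as a quantization config.
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    max_memory=max_memory,
)

generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))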
