# Download the Xunzi-Qwen-Chat checkpoint from ModelScope
!git clone https://www.modelscope.cn/Xunzillm4cc/Xunzi-Qwen-Chat.git
# Install dependencies pinned to versions compatible with Qwen's remote code
!pip install transformers==4.32.0 accelerate tiktoken einops scipy transformers_stream_generator==0.0.4 peft deepspeed
# (Optional) Build FlashAttention from source for faster attention kernels on supported GPUs
!git clone https://github.com/Dao-AILab/flash-attention
!cd flash-attention && pip install .
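# Sanity check (a minimal sketch): confirm flash_attn built and imports cleanly;
# Qwen's remote code falls back to standard attention if it is missing.
try:
    import flash_attn
    print("flash-attn version:", getattr(flash_attn, "__version__", "unknown"))
except ImportError:
    print("flash-attn not installed; the model will use standard attention")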
# bitsandbytes is required for the 4-bit load below (accelerate is already installed above)
!pip install bitsandbytes
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
# Note: the tokenizer's injection-attack prevention is now off by default.
tokenizer = AutoTokenizer.from_pretrained("/content/Xunzi-Qwen-Chat", trust_remote_code=True)
# Reserve ~2 GB of headroom per GPU and cap per-device usage accordingly
max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'
n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory for i in range(n_gpus)}
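# Quick check (a sketch): print the per-GPU budget dict that will be passed to
# from_pretrained, e.g. {0: '13GB'} on a single 15 GB T4.
print(f"{n_gpus} GPU(s); per-device memory budget: {max_memory}")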
# use bf16
# model = AutoModelForCausalLM.from_pretrained("/content/Xunzi-Qwen-Chat", device_map="auto", trust_remote_code=True, bf16=True).eval()
# use fp16
# model = AutoModelForCausalLM.from_pretrained("/content/Xunzi-Qwen-Chat", device_map="auto", trust_remote_code=True, fp16=True).eval()
# use cpu only
# model = AutoModelForCausalLM.from_pretrained("/content/Xunzi-Qwen-Chat", device_map="cpu", trust_remote_code=True).eval()
# Load the weights in 4-bit via bitsandbytes to fit into limited GPU memory;
# device_map="auto" shards layers across the available devices, spilling to "offload" on disk if needed.
model = AutoModelForCausalLM.from_pretrained("/content/Xunzi-Qwen-Chat", device_map="auto", load_in_4bit=True,
                                             max_memory=max_memory, offload_folder='offload', trust_remote_code=True).eval()
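# Optional check (a sketch, assuming an accelerate-dispatched 4-bit load):
# report the quantized weights' memory use and where each module landed.
print(f"Model footprint: {model.get_memory_footprint() / 1024**3:.2f} GB")
print("Device map:", model.hf_device_map)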
# Specify hyperparameters for generation. With transformers>=4.32.0 this is unnecessary:
# the checkpoint's generation_config.json is picked up automatically.
# model.generation_config = GenerationConfig.from_pretrained("/content/Xunzi-Qwen-Chat", trust_remote_code=True)
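# Optional (a sketch): tweak individual decoding settings on the loaded config
# instead of replacing it wholesale.
# model.generation_config.max_new_tokens = 512
# model.generation_config.top_p = 0.8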
# Prompt (Chinese): "List the ten greatest poets of ancient China:"
text = "请列举十个中国古代最伟大的诗人:"
inputs = tokenizer(text, return_tensors='pt')
inputs = inputs.to(model.device)
pred = model.generate(**inputs)
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
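# Qwen-Chat checkpoints expose a chat() helper through trust_remote_code; a
# minimal sketch, assuming the Xunzi fine-tune keeps Qwen's chat interface.
# history carries the running conversation for multi-turn use.
# Prompt (Chinese): "Briefly introduce the philosophy of Xunzi."
response, history = model.chat(tokenizer, "请简要介绍荀子的思想。", history=None)
print(response)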