# AUTO LOADER
# credits - some parts "borrowed" from oobabooga
# https://github.com/oobabooga/text-generation-webui/blob/main/modules/models.py

# (A) LOAD SETTINGS
import a_settings as set  # note: "set" shadows the built-in set() here
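
# NOTE: a_settings.py is assumed to define the names read below; a minimal
# sketch (values are illustrative, inferred from how this file uses them):
#   model_file = "models/model.gguf"        # if present, the llama.cpp path is used
#   model_name = "some-org/some-model"      # otherwise, a Hugging Face model id
#   model_args = {"max_new_tokens": 256}    # kwargs forwarded to LlamaCpp / pipeline()
#   gpu        = True                       # GPU-accelerated loading for Hugging Face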

# (B) MANUALLY SPECIFIED MODEL - USE LLAMA CPP
if hasattr(set, "model_file"):
    from langchain.llms import LlamaCpp
    llm = LlamaCpp(
        model_path=set.model_file,
        **set.model_args
    )
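
    # Illustrative model_args for this path (parameter names are from
    # langchain.llms.LlamaCpp; the values are assumptions, not project settings):
    #   model_args = {"n_ctx": 2048, "temperature": 0.7, "max_tokens": 256}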

# (C) HUGGING FACE
else:
    # (C1) IMPORT TRANSFORMERS MODULES
    import torch, psutil
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
    from accelerate import infer_auto_device_map, init_empty_weights
    from langchain import HuggingFacePipeline

    # (C2) HELPER - AUTO MAX MEMORY CALCULATION
    def max_mem():
        # (C2-1) GPU MEMORY
        # round total VRAM (in MiB) down to the nearest GB, keeping ~1 GB headroom
        total = torch.cuda.get_device_properties(0).total_memory / (1024 * 1024)
        suggestion = round((total - 1000) / 1000) * 1000
        if total - suggestion < 800:
            suggestion -= 1000
        suggestion = int(round(suggestion / 1000))
        max_memory = {0: f"{suggestion}GiB"}

        # (C2-2) CPU MEMORY
        # same heuristic, applied to currently available system RAM
        total = psutil.virtual_memory().available / (1024 * 1024)
        suggestion = round((total - 1000) / 1000) * 1000
        if total - suggestion < 800:
            suggestion -= 1000
        suggestion = int(round(suggestion / 1000))
        max_memory["cpu"] = f"{suggestion}GiB"

        # (C2-3) RETURN CALCULATED MEMORY
        return max_memory
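
    # For example (illustrative figures): on a 12 GiB GPU with ~32 GiB of free
    # RAM, max_mem() returns something like {0: "11GiB", "cpu": "31GiB"},
    # the max_memory format expected by accelerate's infer_auto_device_map.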

    # (C3) INIT MODEL PARAMS
    params = {
        "low_cpu_mem_usage": True,
        "device_map": "auto"
    }

    # (C4) GPU ACCELERATED
    if set.gpu:
        # build an empty-weights skeleton of the model so accelerate can plan
        # a GPU/CPU device map without loading the real weights yet
        config = AutoConfig.from_pretrained(set.model_name)
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)
        model.tie_weights()
        params["device_map"] = infer_auto_device_map(
            model,
            dtype=config.torch_dtype,
            max_memory=max_mem(),
            no_split_module_classes=model._no_split_modules
        )

    # (C5) CPU ONLY
    else:
        params["torch_dtype"] = torch.float32

    # (C6) LOAD MODEL
    model = AutoModelForCausalLM.from_pretrained(set.model_name, **params)

    # (C7) LLM/PIPE
    llm = HuggingFacePipeline(pipeline=pipeline(
        task="text-generation",
        model=model,
        tokenizer=AutoTokenizer.from_pretrained(set.model_name),
        **set.model_args
    ))
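
# Usage sketch (prompt is illustrative): both branches leave a LangChain LLM
# in `llm`, so downstream code can call it directly, e.g.
#   print(llm("Q: What is 1 + 1? A:"))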