konieshadow committed
Commit 642af4d · Parent: 924aa01

Fix LLM invocation issue
examples/simple_llm.py CHANGED
```diff
@@ -15,17 +15,15 @@ if __name__ == "__main__":
     try:
         # model_name = "mlx-community/gemma-3-12b-it-4bit-DWQ"
         model_name = "google/gemma-3-4b-it"
-        use_4bit_quantization = False
-        device = "mps"
+        device = "cuda"
 
         # gemma_chat = GemmaMLXChatCompletion(model_name="mlx-community/gemma-3-12b-it-4bit-DWQ")
         # Alternatively, if you have a smaller, faster model, try e.g. "mlx-community/gemma-2b-it-8bit"
         if model_name.startswith("mlx-community"):
             gemma_chat = GemmaMLXChatCompletion(model_name=model_name)
         else:
-            # If the device is mps, use float32 for better stability
-            dtype_to_use = torch.float32 if device == "mps" else torch.float16
-            gemma_chat = GemmaTransformersChatCompletion(model_name=model_name, use_4bit_quantization=use_4bit_quantization, device=device, torch_dtype=dtype_to_use)
+            # If the device is mps or cuda, use float32 for better stability
+            gemma_chat = GemmaTransformersChatCompletion(model_name=model_name, device=device)
 
         print("\n--- Example 1: simple user query ---")
         messages_example1 = [
```
 
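The example now hardcodes `device = "cuda"`, which will fail on machines without an NVIDIA GPU. A minimal sketch of selecting the device at runtime instead; the `pick_device` helper is hypothetical and not part of this repo:

```python
import torch

def pick_device() -> str:
    """Hypothetical helper: prefer CUDA, then Apple MPS, then fall back to CPU."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

device = pick_device()  # instead of hardcoding device = "cuda"
```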
src/podcast_transcribe/llm/llm_base.py CHANGED
```diff
@@ -174,45 +174,16 @@ class TransformersBaseChatCompletion(BaseChatCompletion):
     def __init__(
         self,
         model_name: str,
-        use_4bit_quantization: bool = False,
         device_map: Optional[str] = None,
         device: Optional[str] = None,
-        trust_remote_code: bool = True,
-        torch_dtype: Optional[torch.dtype] = None
     ):
         super().__init__(model_name)
-        self.use_4bit_quantization = use_4bit_quantization
         self.device_map = device_map
-        self.trust_remote_code = trust_remote_code
-        self.torch_dtype = torch_dtype or torch.float16
         self.device = device
 
         # Load the model and tokenizer
         self._load_model_and_tokenizer()
 
-    def _get_quantization_config(self):
-        """Get the quantization config"""
-        if not self.use_4bit_quantization:
-            return None
-
-        if self.device and self.device.type == "mps":
-            print("Warning: MPS devices do not support 4-bit quantization; disabling quantization")
-            self.use_4bit_quantization = False
-            return None
-
-        # Import the quantization config
-        try:
-            from transformers import BitsAndBytesConfig
-        except ImportError:
-            raise ImportError("Please install the bitsandbytes library first: pip install bitsandbytes")
-
-        return BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=self.torch_dtype,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True,
-        )
-
     def _load_tokenizer(self):
         """Load the tokenizer"""
         try:
@@ -222,7 +193,7 @@ class TransformersBaseChatCompletion(BaseChatCompletion):
 
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_name,
-            trust_remote_code=self.trust_remote_code
+            trust_remote_code=True
         )
 
         # Set pad_token if it does not exist
@@ -237,22 +208,14 @@ class TransformersBaseChatCompletion(BaseChatCompletion):
             raise ImportError("Please install the transformers library first: pip install transformers")
 
         print(f"Loading model: {self.model_name}")
-        print(f"4-bit quantization: {'enabled' if self.use_4bit_quantization else 'disabled'}")
         print(f"Target device: {self.device}")
         print(f"Device map: {self.device_map}")
 
         # Configure model loading arguments
         model_kwargs = {
-            "trust_remote_code": self.trust_remote_code,
-            "torch_dtype": self.torch_dtype,
+            "trust_remote_code": True,
         }
 
-        # Handle the quantization config
-        quantization_config = self._get_quantization_config()
-        if quantization_config:
-            model_kwargs["quantization_config"] = quantization_config
-            print(f"Using 4-bit quantization config")
-
         # Handle device mapping
         if self.device_map is not None:
             if self.device and self.device.type == "mps":
@@ -267,10 +230,9 @@ class TransformersBaseChatCompletion(BaseChatCompletion):
         )
 
         # MPS or manual device management
-        if self.device_map is None or (self.device and self.device.type == "mps"):
-            if not self.use_4bit_quantization:
-                print(f"Manually moving model to device: {self.device}")
-                self.model = self.model.to(self.device)
+        if self.device_map is None:
+            print(f"Manually moving model to device: {self.device}")
+            self.model = self.model.to(self.device)
 
         print(f"Model {self.model_name} loaded successfully")
 
@@ -287,11 +249,8 @@ class TransformersBaseChatCompletion(BaseChatCompletion):
     def _print_error_hints(self):
         """Print error hint messages"""
         print("Please make sure the model name is correct and accessible.")
-        if self.use_4bit_quantization:
-            print("If using quantization, make sure the bitsandbytes library is installed: pip install bitsandbytes")
-        if self.device and self.device.type == "mps":
+        if self.device and self.device == "mps":
             print("Notes for MPS devices:")
-            print("- 4-bit quantization is not supported")
             print("- device_map is not supported")
             print("- Make sure your PyTorch build supports MPS")
 
@@ -352,12 +311,10 @@ class TransformersBaseChatCompletion(BaseChatCompletion):
         """Get model info"""
         model_info = {
             "model_name": self.model_name,
-            "use_4bit_quantization": self.use_4bit_quantization,
             "device": str(self.device),
             "device_type": self.device.type,
             "device_map": self.device_map,
             "model_type": "transformers",
-            "torch_dtype": str(self.torch_dtype),
             "mps_available": torch.backends.mps.is_available() if hasattr(torch.backends, 'mps') else False,
             "cuda_available": torch.cuda.is_available(),
         }
```
 
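Stripped of the class plumbing, the post-commit loading path reduces to roughly the following sketch: `trust_remote_code` is now always `True`, no dtype or quantization config is passed, and the model is moved to the target device manually whenever `device_map` is unset. The `AutoModelForCausalLM` class and the device choice here are assumptions; the diff does not show which model class the repo uses:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "google/gemma-3-4b-it"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# trust_remote_code is now hardcoded to True for both tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# No torch_dtype or quantization_config: transformers falls back to its defaults.
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
# With device_map unset, the model is moved to the target device manually.
model = model.to(device)
```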
src/podcast_transcribe/llm/llm_gemma_transfomers.py CHANGED
```diff
@@ -10,20 +10,14 @@ class GemmaTransformersChatCompletion(TransformersBaseChatCompletion):
     def __init__(
         self,
         model_name: str = "google/gemma-3-4b-it",
-        use_4bit_quantization: bool = False,
         device_map: Optional[str] = None,
         device: Optional[str] = None,
-        trust_remote_code: bool = True,
-        torch_dtype: Optional[torch.dtype] = None
     ):
         # Gemma uses float16 as its default dtype
         super().__init__(
             model_name=model_name,
-            use_4bit_quantization=use_4bit_quantization,
             device_map=device_map,
             device=device,
-            trust_remote_code=trust_remote_code,
-            torch_dtype=torch_dtype if torch_dtype is not None else torch.float16
         )
 
     def _print_error_hints(self):
@@ -38,7 +32,6 @@ class GemmaTransformersChatCompletion(TransformersBaseChatCompletion):
 # For backward compatibility, a simplified factory function is also provided
 def create_gemma_transformers_client(
     model_name: str = "google/gemma-3-4b-it",
-    use_4bit_quantization: bool = False,
     device: Optional[str] = None,
     **kwargs
 ) -> GemmaTransformersChatCompletion:
@@ -47,7 +40,6 @@ def create_gemma_transformers_client(
 
     Args:
         model_name: model name
-        use_4bit_quantization: whether to use 4-bit quantization
         device: target device ("cpu", "cuda", "mps", etc.)
         **kwargs: other arguments passed through to the constructor
 
@@ -56,7 +48,6 @@ def create_gemma_transformers_client(
     """
     return GemmaTransformersChatCompletion(
         model_name=model_name,
-        use_4bit_quantization=use_4bit_quantization,
         device=device,
         **kwargs
     )
```
 
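For reference, constructing a client through the slimmed-down factory now takes only the remaining parameters (a usage sketch; the argument values are illustrative):

```python
# Only model_name, device, and pass-through **kwargs remain after this commit;
# use_4bit_quantization is no longer accepted.
client = create_gemma_transformers_client(
    model_name="google/gemma-3-4b-it",
    device="cuda",
)
```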
src/podcast_transcribe/llm/llm_router.py CHANGED
```diff
@@ -38,8 +38,7 @@ class LLMRouter:
             "class_name": "GemmaTransformersChatCompletion",
             "default_model": "google/gemma-3-4b-it",
             "supported_params": [
-                "model_name", "use_4bit_quantization", "device_map",
-                "device", "trust_remote_code", "torch_dtype"
+                "model_name", "device_map",
             ],
             "description": "Gemma chat completion implementation based on the Transformers library"
         }
@@ -191,7 +190,7 @@ class LLMRouter:
             max_tokens: maximum number of tokens to generate
             top_p: nucleus sampling parameter
             model: optional model name; overrides the default model_name if provided
-            **kwargs: other parameters, such as device and use_4bit_quantization
+            **kwargs: other parameters, such as device
 
         Returns:
             Chat completion response dict
@@ -207,12 +206,6 @@ class LLMRouter:
         if model is not None:
             kwargs["model_name"] = model
 
-        # If the device is mps and this is a transformers provider, force float32
-        current_device = kwargs.get("device")
-        if current_device == "mps":
-            if provider == "gemma-transformers":
-                kwargs["torch_dtype"] = torch.float32
-
         # Get or create the LLM instance
         llm_instance = self._get_or_create_instance(provider, **kwargs)
 
@@ -271,12 +264,6 @@ class LLMRouter:
         if model is not None:
             kwargs["model_name"] = model
 
-        # If the device is mps and this is a transformers provider, force float32
-        current_device = kwargs.get("device")
-        if current_device == "mps":
-            if provider == "gemma-transformers":
-                kwargs["torch_dtype"] = torch.float32
-
         # Get or create the LLM instance
         llm_instance = self._get_or_create_instance(provider, **kwargs)
 
@@ -378,9 +365,7 @@ def chat_completion(
     top_p: float = 1.0,
     model: Optional[str] = None,
     device: Optional[str] = None,
-    use_4bit_quantization: bool = False,
     device_map: Optional[str] = None,
-    trust_remote_code: bool = True,
     **kwargs
 ) -> Dict[str, Any]:
     """
@@ -396,9 +381,7 @@ def chat_completion(
         top_p: nucleus sampling parameter (0.0-1.0)
         model: model name; the default model is used if not specified
         device: inference device, 'cpu', 'cuda', or 'mps' (transformers providers only)
-        use_4bit_quantization: whether to use 4-bit quantization (transformers providers only)
         device_map: device map configuration (transformers providers only)
-        trust_remote_code: whether to trust remote code (transformers providers only)
         **kwargs: other parameters
 
     Returns:
@@ -417,7 +400,6 @@ def chat_completion(
         provider="gemma-transformers",
         model="google/gemma-3-4b-it",
         device="cuda",
-        use_4bit_quantization=True
     )
 
     # Custom parameters
@@ -437,12 +419,8 @@ def chat_completion(
         params["model_name"] = model
     if device is not None:
         params["device"] = device
-    if use_4bit_quantization:
-        params["use_4bit_quantization"] = use_4bit_quantization
     if device_map:
         params["device_map"] = device_map
-    if not trust_remote_code:
-        params["trust_remote_code"] = trust_remote_code
 
     return _router.chat_completion(
         messages=messages,
@@ -463,9 +441,7 @@ def reasoning_completion(
     top_p: float = 0.9,
     model: Optional[str] = None,
     device: Optional[str] = None,
-    use_4bit_quantization: bool = False,
     device_map: Optional[str] = None,
-    trust_remote_code: bool = True,
     extract_reasoning_steps: bool = True,
     **kwargs
 ) -> Dict[str, Any]:
@@ -480,9 +456,7 @@ def reasoning_completion(
        top_p: nucleus sampling parameter
        model: model name; the default model is used if not specified
        device: inference device
-        use_4bit_quantization: whether to use 4-bit quantization
        device_map: device map configuration
-        trust_remote_code: whether to trust remote code
        extract_reasoning_steps: whether to extract reasoning steps
        **kwargs: other parameters
 
@@ -510,12 +484,8 @@ def reasoning_completion(
         params["model_name"] = model
     if device is not None:
         params["device"] = device
-    if use_4bit_quantization:
-        params["use_4bit_quantization"] = use_4bit_quantization
     if device_map:
         params["device_map"] = device_map
-    if not trust_remote_code:
-        params["trust_remote_code"] = trust_remote_code
 
     return _router.reasoning_completion(
         messages=messages,
```
 
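Putting it together, a post-commit call through the module-level helper matches the updated docstring example. A sketch, assuming OpenAI-style message dicts; the exact message format is not shown in this diff:

```python
from podcast_transcribe.llm.llm_router import chat_completion

response = chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],  # assumed message shape
    provider="gemma-transformers",
    model="google/gemma-3-4b-it",
    device="cuda",  # use_4bit_quantization / trust_remote_code are no longer accepted
)
print(response)
```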