| |
|
|
| import torch |
| import os |
|
|
| from modelscope.models.base import TorchModel |
| from modelscope.preprocessors.base import Preprocessor |
| from modelscope.pipelines.base import Model, Pipeline |
| from modelscope.utils.config import Config |
| from modelscope.pipelines.builder import PIPELINES |
| from modelscope.preprocessors.builder import PREPROCESSORS |
| from modelscope.models.builder import MODELS |
| from modelscope.preprocessors.image import load_image |
|
|
|
|
| from vlmo.utils.beit_utils import load_from_config |
|
|
|
|
@PIPELINES.register_module(
    "multi-modal-embeddings", module_name="multi-modal-embedding-pipeline"
)
class MyCustomPipeline(Pipeline):
    """Multi-modal embedding pipeline backed by the M2-Encoder model.

    The mode is selected by the keys of the input dict:

    * Zero-shot classification -- ``{"image": ..., "label_list": "a,b,c"}``
      (at least two comma-separated labels) returns
      ``{"text": best_label, "scores": [...]}``.
    * Embedding extraction -- ``{"text": ...}`` and/or ``{"img": ...}``
      returns ``{"text_embedding": ...}`` and/or ``{"img_embedding": ...}``.

    Examples:

    >>> from modelscope.pipelines import pipeline
    >>> pipe = pipeline("multi-modal-embeddings", model="M2Cognition/M2-Encoder")
    >>> pipe({"image": "path/or/url.jpeg", "label_list": "cat,dog"})
    >>> pipe({"text": ["cat", "dog"]})

    """

    def __init__(self, model, preprocessor=None, **kwargs):
        """Create the pipeline and load the M2-Encoder checkpoint.

        Args:
            model: forwarded to ``Pipeline.__init__`` and also used as the
                directory containing ``vlmo/tokenizer`` and
                ``m2_encoder_1B.ckpt``.
            preprocessor: accepted for interface compatibility; unused here.
            **kwargs: accepted for interface compatibility; unused here.
        """
        super().__init__(model=model, auto_collate=False)
        self.model_dir = model
        self._device = "cuda" if torch.cuda.is_available() else "cpu"

        model_config = {
            "loss_names": {"itc": 1},
            "beit_version": "large",
            "encoder_embed_dim": 1024,
            "out_embed_dim": 1024,
            "encoder_layers": 21,
            "beit3_vl_layers": 3,
            "visual_mask_size": 14,
            "tokenizer_type": "GLMChineseTokenizer",
            "tokenizer": os.path.join(self.model_dir, "./vlmo/tokenizer"),
            "vocab_size": 115244,
            "whole_word_masking": False,
            "precision": 32,
            "test_only": True,
            "flash_attn": True,
            "model_path": os.path.join(self.model_dir, "m2_encoder_1B.ckpt"),
            "modelscope": {
                "model_id": "M2Cognition/M2_Encoder_Large"
            },
            "model_file": "m2_encoder_1B.ckpt"
        }
        model, processors = load_from_config(model_config)
        self.model = model
        self.model.to(self._device).eval()
        self._tokenizer, self._img_processor = processors

    def _sanitize_parameters(self, **pipeline_parameters):
        """Split call-time keyword arguments into per-stage parameter dicts.

        Returns:
            tuple: ``(preprocess_params, forward_params, postprocess_params)``.
            Every keyword argument is routed to the forward stage; the
            preprocess and postprocess dicts are always empty.
        """
        return {}, pipeline_parameters, {}

    def _check_input(self, inputs):
        """Accept any input; no validation is performed."""
        pass

    def _check_output(self, outputs):
        """Accept any output; no validation is performed."""
        pass

    def _embed_text(self, text):
        """Tokenize ``text`` and return the model's ``cls_vlffn_feats`` for it."""
        txt_encoding = self._tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.model.hparams.config["max_text_len"],
            return_special_tokens_mask=True,
        )
        txt_data = {
            "text_ids": torch.tensor(txt_encoding["input_ids"]).to(self._device),
            "text_masks": torch.tensor(txt_encoding["attention_mask"]).to(
                self._device
            ),
            "text_labels": None,
        }
        return self.model.infer_text(txt_data)["cls_vlffn_feats"]

    def _embed_image(self, image):
        """Preprocess one image and return the model's ``cls_vlffn_feats`` for it."""
        img = self._img_processor(image).unsqueeze(0)
        img_data = {"image": [img.to(self._device)]}
        return self.model.infer_image(img_data)["cls_vlffn_feats"]

    def forward(self, forward_params):
        """Run classification or embedding extraction on one input dict.

        Args:
            forward_params: the input dict. With a ``"label_list"`` string
                holding at least two comma-separated labels, ``"image"`` is
                required and is loaded with ``load_image``. Otherwise the
                optional keys ``"text"`` and ``"img"`` are embedded; a string
                ``"img"`` is loaded with ``load_image`` first, any other value
                is handed to the image processor as-is.

        Returns:
            dict: ``{"text": label, "scores": list}`` in classification mode,
            else a dict with ``"text_embedding"`` and/or ``"img_embedding"``
            (empty when neither key is present).

        Raises:
            KeyError: in classification mode when ``"image"`` is missing.
        """
        labels = forward_params.get("label_list", "").split(",")

        # Inference only: every result is detached anyway, so skip autograd
        # bookkeeping even when forward() is called directly.
        with torch.no_grad():
            if len(labels) > 1 and labels[0] != "":
                txt_feats = self._embed_text(labels)
                image = load_image(forward_params["image"])
                img_feats = self._embed_image(image)
                logits_per_image = (
                    self.model.logit_scale.exp() * img_feats @ txt_feats.t()
                )
                probs = logits_per_image.softmax(dim=-1).detach().cpu()
                # Index of the highest-probability label for the single image.
                index = int(probs.max(dim=-1)[1][0])
                return {"text": labels[index], "scores": probs.numpy().tolist()[0]}

            rets = {}
            if "text" in forward_params:
                txt_feats = self._embed_text(forward_params["text"])
                rets["text_embedding"] = txt_feats.detach()
            if "img" in forward_params:
                input_img = forward_params["img"]
                # Mirror the classification branch: a path/URL string is
                # loaded before it reaches the image processor.
                if isinstance(input_img, str):
                    input_img = load_image(input_img)
                img_feats = self._embed_image(input_img)
                rets["img_embedding"] = img_feats.detach()
            return rets

    def preprocess(self, inputs):
        """Return ``inputs`` unchanged; all preparation happens in ``forward``."""
        return inputs

    def postprocess(self, inputs):
        """Return the forward output unchanged.

        Args:
            inputs: the dict produced by ``forward``.

        Return:
            dict of results: the same object that was passed in.
        """
        return inputs
|
|
|
|
| """ |
| # Tip: usr_config_path is the temporary location where the configuration is saved; after uploading to the ModelScope hub, it is the model_id. |
| usr_config_path = "/tmp/snapdown/" |
| config = Config( |
| { |
| "framework": "pytorch", |
| "task": "multi-modal-embeddings", |
| "model": {"type": "m2-encoder"}, |
| "pipeline": {"type": "multi-modal-embedding-pipeline"}, |
| "allow_remote": True, |
| } |
| ) |
| config.dump("/tmp/snapdown/" + "configuration.json") |
| """ |
|
|
if __name__ == "__main__":
    # Manual smoke test: exercise classification, text embedding and image
    # embedding through the registered pipeline, printing each result.
    from modelscope.pipelines import pipeline
    from modelscope.preprocessors.image import load_image

    image_url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
    pipe = pipeline("multi-modal-embeddings", model="M2Cognition/M2-Encoder")

    # Zero-shot classification over a comma-separated label list.
    classification_request = {
        "image": image_url,
        "label_list": "杰尼龟,妙蛙种子,小火龙,皮卡丘",
    }
    print("demo output", pipe(classification_request))

    # Text-only embedding.
    text_request = {"text": ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]}
    print("text output", pipe(text_request))

    # Image-only embedding from an already loaded image.
    image_request = {"img": load_image(image_url)}
    print("image output", pipe(image_request))
|
|