Krishna Indukuri committed
Commit 22fcf31 · verified · 1 Parent(s): e031746

Upload 31 files

Files changed (8)
  1. README_CUSTOM.md +87 -0
  2. api.py +94 -0
  3. app.py +226 -0
  4. custom_st.py +194 -151
  5. inference.py +132 -0
  6. model_card.md +85 -0
  7. pipeline.py +98 -0
  8. requirements.txt +14 -0
README_CUSTOM.md ADDED
@@ -0,0 +1,87 @@
+ # Custom Embedding Model
+
+ This repository contains a custom embedding model based on Jina Embeddings V4, optimized for generating embeddings for text, images, and visual documents.
+
+ ## Features
+
+ - Multimodal embeddings for text and images
+ - Multilingual support (30+ languages)
+ - Task-specific adapters (retrieval, text-matching, code)
+ - Flexible embedding dimensions
+
+ ## Setup
+
+ 1. Install the required dependencies:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 2. You can use the model in different ways:
+
+ ### Using the Handler
+
+ ```python
+ from handler import ModelHandler
+
+ # Initialize the model
+ model_handler = ModelHandler()
+ model_handler.initialize(None)
+
+ # Process text inputs
+ text_inputs = ["Your text here", "Another example"]
+ features = model_handler.preprocess({"body": {"inputs": text_inputs}})
+ result = model_handler.inference(features)
+ print(result)  # {"embeddings": [...]}
+ ```
+
+ ### Using the API
+
+ Run the API server:
+
+ ```bash
+ python api.py
+ ```
+
+ Then make API requests:
+
+ ```python
+ import requests
+
+ response = requests.post(
+     "http://localhost:8000/embeddings",
+     json={
+         "inputs": [{"text": "Your text here"}, {"text": "Another example"}],
+         "task": "retrieval"
+     }
+ )
+ print(response.json())  # {"embeddings": [...]}
+ ```
+
+ ### Using the Pipeline
+
+ ```python
+ from pipeline import load_pipeline
+
+ # Load the pipeline
+ pipeline = load_pipeline("path/to/model")
+
+ # Generate embeddings
+ embeddings = pipeline("Your text here", task="retrieval")
+ print(embeddings.shape)  # (1, 2048)
+ ```
+
+ ## Demo UI
+
+ You can also run a Gradio demo UI:
+
+ ```bash
+ python app.py
+ ```
+
+ This will start a web interface for testing embeddings and comparing similarities between text and images.
+
+ ## License
+
+ This model is available under the same terms as the original model it's based on. Please refer to the license information for details.
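One feature listed above, flexible embedding dimensions, is not shown in the snippets. A minimal sketch, assuming the `load_pipeline` helper from `pipeline.py` and a model wrapper whose `forward()` accepts `truncate_dim` (the pipeline simply forwards the argument):

```python
# Sketch only: exercising the flexible-dimension feature via the pipeline.
# "path/to/model" is a placeholder; truncate_dim support depends on the wrapper's forward().
from pipeline import load_pipeline

pipeline = load_pipeline("path/to/model")

full = pipeline("Your text here", task="retrieval")
short = pipeline("Your text here", task="retrieval", truncate_dim=512)
print(full.shape, short.shape)  # e.g. (1, 2048) vs. (1, 512)
```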
api.py ADDED
@@ -0,0 +1,94 @@
+ from fastapi import FastAPI, Request, Response, HTTPException
+ from fastapi.responses import JSONResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ import uvicorn
+ import torch
+ import json
+ import base64
+ from io import BytesIO
+ from PIL import Image
+ import requests
+ from typing import List, Dict, Any, Union, Optional
+ from pydantic import BaseModel, Field
+ import numpy as np
+ import os
+
+ # Import handler
+ from handler import ModelHandler
+
+ app = FastAPI(title="Embedding Model API")
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Initialize model handler
+ model_handler = ModelHandler()
+ model_handler.initialize(None)  # We'll handle context manually
+
+ # Define request/response models
+ class TextInput(BaseModel):
+     text: str = Field(..., description="The text to generate embeddings for")
+
+ class ImageInput(BaseModel):
+     image: str = Field(..., description="URL or base64-encoded image to generate embeddings for")
+
+ class EmbeddingRequest(BaseModel):
+     inputs: List[Union[TextInput, ImageInput]] = Field(..., description="List of text or image inputs")
+     task: str = Field("retrieval", description="Task type: retrieval, text-matching, or code")
+
+ class EmbeddingResponse(BaseModel):
+     embeddings: List[List[float]] = Field(..., description="List of embeddings")
+
+ @app.get("/")
+ async def root():
+     return {"message": "Embedding Model API is running"}
+
+ @app.post("/embeddings", response_model=EmbeddingResponse)
+ async def create_embeddings(request: EmbeddingRequest):
+     try:
+         inputs = []
+
+         # Process inputs
+         for item in request.inputs:
+             if hasattr(item, "text"):
+                 inputs.append(item.text)
+             elif hasattr(item, "image"):
+                 image_data = item.image
+                 if image_data.startswith("http"):
+                     # URL
+                     response = requests.get(image_data)
+                     image = Image.open(BytesIO(response.content)).convert("RGB")
+                 elif image_data.startswith("data:image"):
+                     # Base64
+                     image_b64 = image_data.split(",")[1]
+                     image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB")
+                 else:
+                     raise HTTPException(status_code=400, detail="Invalid image format")
+                 inputs.append(image)
+
+         # Get embeddings
+         features = model_handler.model.tokenize(inputs)
+         outputs = model_handler.model.forward(features, task=request.task)
+         embeddings = outputs.get("sentence_embedding", None)
+
+         if embeddings is None:
+             raise HTTPException(status_code=500, detail="Failed to generate embeddings")
+
+         # Convert to list for JSON serialization
+         embeddings_list = embeddings.cpu().numpy().tolist()
+
+         return {"embeddings": embeddings_list}
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     # Run the API server
+     port = int(os.environ.get("PORT", 8000))
+     uvicorn.run(app, host="0.0.0.0", port=port)
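Besides text, the `/embeddings` endpoint in `api.py` also accepts image inputs given as an HTTP(S) URL or a `data:image/...` base64 URI. A hedged request sketch against a locally running server (host, port, and the image URL are placeholders):

```python
# Sketch only: mixing an image input (by URL) and a text input in one request.
# The endpoint and image URL below are placeholders, not fixed values.
import requests

response = requests.post(
    "http://localhost:8000/embeddings",
    json={
        "inputs": [
            {"image": "https://example.com/some-image.jpg"},
            {"text": "A caption to compare against"},
        ],
        "task": "retrieval",
    },
)
print(len(response.json()["embeddings"]))  # one embedding per input
```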
app.py ADDED
@@ -0,0 +1,226 @@
+ import os
+ import torch
+ import gradio as gr
+ import numpy as np
+ from typing import List, Union, Optional
+ from PIL import Image
+ import requests
+ from io import BytesIO
+ import base64
+
+ # Import your handler
+ from handler import ModelHandler
+
+ # Create model handler instance
+ model_handler = ModelHandler()
+ model_handler.initialize(None)  # We'll handle device placement manually
+
+ def cosine_similarity(embedding1, embedding2):
+     """Calculate cosine similarity between two embeddings"""
+     embedding1 = np.array(embedding1)
+     embedding2 = np.array(embedding2)
+     return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
+
+ def process_image(image_input):
+     """Process image input (URL, local path, base64 string, or PIL image)"""
+     if isinstance(image_input, str):
+         if image_input.startswith("http"):
+             # URL
+             response = requests.get(image_input)
+             image = Image.open(BytesIO(response.content)).convert("RGB")
+         elif image_input.startswith("data:image"):
+             # Base64
+             image_data = image_input.split(",")[1]
+             image = Image.open(BytesIO(base64.b64decode(image_data))).convert("RGB")
+         else:
+             # Local path
+             image = Image.open(image_input).convert("RGB")
+     elif isinstance(image_input, Image.Image):
+         # Already a PIL image (e.g. uploaded via Gradio with type="pil")
+         image = image_input.convert("RGB")
+     else:
+         # File-like object uploaded from Gradio
+         image = Image.open(image_input).convert("RGB")
+     return image
+
+ def generate_embeddings(inputs, task="retrieval", input_type="text"):
+     """Generate embeddings for text or image inputs"""
+     try:
+         # Handle different input types
+         if input_type == "text":
+             features = model_handler.model.tokenize(inputs)
+         else:  # image
+             processed_images = [process_image(img) for img in inputs]
+             features = model_handler.model.tokenize(processed_images)
+
+         # Process features through model
+         with torch.no_grad():
+             outputs = model_handler.model.forward(features, task=task)
+             embeddings = outputs.get("sentence_embedding", None)
+
+         if embeddings is not None:
+             return embeddings.cpu().numpy().tolist()
+         else:
+             return None
+     except Exception as e:
+         return {"error": str(e)}
+
+ def text_to_embedding(text, task="retrieval"):
+     """Convert text to embedding"""
+     if not text.strip():
+         return None
+     return generate_embeddings([text], task=task, input_type="text")[0]
+
+ def image_to_embedding(image, task="retrieval"):
+     """Convert image to embedding"""
+     if image is None:
+         return None
+     return generate_embeddings([image], task=task, input_type="image")[0]
+
+ def compare_embeddings(embedding1, embedding2):
+     """Compare two embeddings and return similarity"""
+     if embedding1 is None or embedding2 is None:
+         return "Please generate both embeddings first"
+     similarity = cosine_similarity(embedding1, embedding2)
+     return f"Cosine Similarity: {similarity:.4f}"
+
+ # Create Gradio interface
+ with gr.Blocks(title="Embedding Model Demo") as demo:
+     gr.Markdown("# Embedding Model Demo")
+     gr.Markdown("Generate and compare embeddings for text and images")
+
+     with gr.Tab("Text Embeddings"):
+         with gr.Row():
+             with gr.Column():
+                 text_input1 = gr.Textbox(label="Text Input 1", lines=5)
+                 task_dropdown1 = gr.Dropdown(
+                     ["retrieval", "text-matching", "code"],
+                     label="Task",
+                     value="retrieval"
+                 )
+                 text_embed_btn1 = gr.Button("Generate Embedding 1")
+
+             with gr.Column():
+                 text_input2 = gr.Textbox(label="Text Input 2", lines=5)
+                 task_dropdown2 = gr.Dropdown(
+                     ["retrieval", "text-matching", "code"],
+                     label="Task",
+                     value="retrieval"
+                 )
+                 text_embed_btn2 = gr.Button("Generate Embedding 2")
+
+         embedding_output1 = gr.JSON(label="Embedding 1", visible=False)
+         embedding_output2 = gr.JSON(label="Embedding 2", visible=False)
+
+         compare_btn = gr.Button("Compare Embeddings")
+         similarity_output = gr.Textbox(label="Similarity Result")
+
+     with gr.Tab("Image Embeddings"):
+         with gr.Row():
+             with gr.Column():
+                 image_input1 = gr.Image(label="Image Input 1", type="pil")
+                 image_task_dropdown1 = gr.Dropdown(
+                     ["retrieval"],
+                     label="Task",
+                     value="retrieval"
+                 )
+                 image_embed_btn1 = gr.Button("Generate Embedding 1")
+
+             with gr.Column():
+                 image_input2 = gr.Image(label="Image Input 2", type="pil")
+                 image_task_dropdown2 = gr.Dropdown(
+                     ["retrieval"],
+                     label="Task",
+                     value="retrieval"
+                 )
+                 image_embed_btn2 = gr.Button("Generate Embedding 2")
+
+         image_embedding_output1 = gr.JSON(label="Embedding 1", visible=False)
+         image_embedding_output2 = gr.JSON(label="Embedding 2", visible=False)
+
+         image_compare_btn = gr.Button("Compare Embeddings")
+         image_similarity_output = gr.Textbox(label="Similarity Result")
+
+     with gr.Tab("Cross-Modal Comparison"):
+         with gr.Row():
+             with gr.Column():
+                 cross_text_input = gr.Textbox(label="Text Input", lines=5)
+                 cross_text_task = gr.Dropdown(
+                     ["retrieval"],
+                     label="Task",
+                     value="retrieval"
+                 )
+                 cross_text_btn = gr.Button("Generate Text Embedding")
+
+             with gr.Column():
+                 cross_image_input = gr.Image(label="Image Input", type="pil")
+                 cross_image_task = gr.Dropdown(
+                     ["retrieval"],
+                     label="Task",
+                     value="retrieval"
+                 )
+                 cross_image_btn = gr.Button("Generate Image Embedding")
+
+         cross_text_embedding = gr.JSON(label="Text Embedding", visible=False)
+         cross_image_embedding = gr.JSON(label="Image Embedding", visible=False)
+
+         cross_compare_btn = gr.Button("Compare Text and Image")
+         cross_similarity_output = gr.Textbox(label="Similarity Result")
+
+     # Text tab events
+     text_embed_btn1.click(
+         fn=text_to_embedding,
+         inputs=[text_input1, task_dropdown1],
+         outputs=embedding_output1
+     )
+
+     text_embed_btn2.click(
+         fn=text_to_embedding,
+         inputs=[text_input2, task_dropdown2],
+         outputs=embedding_output2
+     )
+
+     compare_btn.click(
+         fn=compare_embeddings,
+         inputs=[embedding_output1, embedding_output2],
+         outputs=similarity_output
+     )
+
+     # Image tab events
+     image_embed_btn1.click(
+         fn=image_to_embedding,
+         inputs=[image_input1, image_task_dropdown1],
+         outputs=image_embedding_output1
+     )
+
+     image_embed_btn2.click(
+         fn=image_to_embedding,
+         inputs=[image_input2, image_task_dropdown2],
+         outputs=image_embedding_output2
+     )
+
+     image_compare_btn.click(
+         fn=compare_embeddings,
+         inputs=[image_embedding_output1, image_embedding_output2],
+         outputs=image_similarity_output
+     )
+
+     # Cross-modal tab events
+     cross_text_btn.click(
+         fn=text_to_embedding,
+         inputs=[cross_text_input, cross_text_task],
+         outputs=cross_text_embedding
+     )
+
+     cross_image_btn.click(
+         fn=image_to_embedding,
+         inputs=[cross_image_input, cross_image_task],
+         outputs=cross_image_embedding
+     )
+
+     cross_compare_btn.click(
+         fn=compare_embeddings,
+         inputs=[cross_text_embedding, cross_image_embedding],
+         outputs=cross_similarity_output
+     )
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     demo.launch()
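The similarity reported by the demo is plain cosine similarity over the generated embeddings. As a rough sketch, the same helpers can also be called outside the UI; this assumes the model behind `ModelHandler` is available locally, since importing `app` initializes it:

```python
# Sketch only: reusing app.py's helpers outside the Gradio UI.
# Importing app initializes ModelHandler, so the model must be available locally.
from app import text_to_embedding, compare_embeddings

emb_a = text_to_embedding("A photo of a cat", task="retrieval")
emb_b = text_to_embedding("A kitten sitting on a sofa", task="retrieval")
print(compare_embeddings(emb_a, emb_b))  # prints "Cosine Similarity: <value>"
```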
custom_st.py CHANGED
@@ -1,186 +1,229 @@
  import json
  import os
  from io import BytesIO
- from pathlib import Path
- from typing import Any, Dict, List, Literal, Optional, Union
  
- import requests
  import torch
- from PIL import Image
  from torch import nn
- from transformers import AutoConfig, AutoModel, AutoProcessor
  
  
  class Transformer(nn.Module):
  
      save_in_root: bool = True
  
      def __init__(
          self,
-         model_name_or_path: str = "jinaai/jina-embeddings-v4",
-         max_seq_length: Optional[int] = None,
-         config_args: Optional[Dict[str, Any]] = None,
-         model_args: Optional[Dict[str, Any]] = None,
-         tokenizer_args: Optional[Dict[str, Any]] = None,
-         cache_dir: Optional[str] = None,
-         backend: Literal["torch", "onnx", "openvino"] = "torch",
          **kwargs,
      ) -> None:
-         super(Transformer, self).__init__()
-         if backend != "torch":
-             raise ValueError(
-                 f"Backend '{backend}' is not supported, please use 'torch' instead"
              )
-         config_kwargs = config_args or {}
-         model_kwargs = model_args or {}
-         tokenizer_kwargs = tokenizer_args or {}
- 
-         self.config = AutoConfig.from_pretrained(
-             model_name_or_path, cache_dir=cache_dir, trust_remote_code=True, **config_kwargs
-         )
-         self.default_task = model_args.pop("default_task", None)
-         if self.default_task and self.default_task not in self.config.task_names:
              raise ValueError(
-                 f"Invalid task: {self.default_task}. Must be one of {self.config.task_names}."
              )
  
-         self.model = AutoModel.from_pretrained(
-             model_name_or_path, config=self.config, cache_dir=cache_dir, trust_remote_code=True, **model_kwargs
-         )
-         self.processor = AutoProcessor.from_pretrained(
-             model_name_or_path,
              cache_dir=cache_dir,
-             use_fast=True,
-             trust_remote_code=True,
-             **tokenizer_kwargs,
          )
-         self.max_seq_length = max_seq_length or 8192
  
-     def tokenize(
-         self, texts: List[Union[str, Image.Image]], padding: Union[str, bool] = True
      ) -> Dict[str, torch.Tensor]:
-         encoding = {}
-         text_indices = []
-         image_indices = []
-         for i, text in enumerate(texts):
-             if isinstance(text, str):
-                 # Remove Query: or Passage: prefixes when checking for URLs or file paths
-                 clean_text = text
-                 if text.startswith("Query: "):
-                     clean_text = text[len("Query: ") :]
-                 elif text.startswith("Passage: "):
-                     clean_text = text[len("Passage: ") :]
- 
-                 if clean_text.startswith("http"):
-                     response = requests.get(clean_text)
-                     texts[i] = Image.open(BytesIO(response.content)).convert("RGB")
-                     image_indices.append(i)
-                 else:
-                     try:
-                         if Path(clean_text).is_file():
-                             texts[i] = Image.open(clean_text).convert("RGB")
-                             image_indices.append(i)
-                         else:
-                             text_indices.append(i)
-                     except Exception as e:
-                         text_indices.append(i)
-             elif isinstance(text, Image.Image):
-                 image_indices.append(i)
-             else:
-                 raise ValueError(f"Invalid input type: {type(text)}")
-         if text_indices:
-             _texts = [texts[i] for i in text_indices]
-             text_features = self.processor.process_texts(
-                 _texts, max_length=self.max_seq_length
              )
-             for key, value in text_features.items():
-                 encoding[f"text_{key}"] = value
-             encoding["text_indices"] = text_indices
  
-         if image_indices:
-             _images = [texts[i] for i in image_indices]
-             img_features = self.processor.process_images(_images)
-             for key, value in img_features.items():
-                 encoding[f"image_{key}"] = value
-             encoding["image_indices"] = image_indices
  
-         return encoding
  
-     def forward(
          self,
-         features: Dict[str, torch.Tensor],
-         task: Optional[str] = None,
-         truncate_dim: Optional[int] = None,
      ) -> Dict[str, torch.Tensor]:
-         self.model.eval()
- 
-         if task is None:
-             if self.default_task is None:
-                 raise ValueError(
-                     "Task must be specified before encoding data. You can set it either during "
-                     "loading the model (e.g., model_kwargs={'default_task': 'retrieval'}) or "
-                     "pass it as an argument to the encode method (e.g., model.encode(texts, task='retrieval'))."
-                 )
-             task = self.default_task
          else:
-             if task not in self.config.task_names:
-                 raise ValueError(
-                     f"Invalid task: {task}. Must be one of {self.config.task_names}."
-                 )
- 
-         device = self.model.device.type
-         all_embeddings = []
- 
-         with torch.no_grad():
-             if any(k.startswith("text_") for k in features.keys()):
-                 text_batch = {
-                     k[len("text_") :]: v.to(device)
-                     for k, v in features.items()
-                     if k.startswith("text_") and k != "text_indices"
-                 }
-                 text_indices = features.get("text_indices", [])
-                 with torch.autocast(device_type=device, dtype=torch.bfloat16):
-                     text_embeddings = self.model(
-                         **text_batch, task_label=task
-                     ).single_vec_emb
-                 if truncate_dim:
-                     text_embeddings = text_embeddings[:, :truncate_dim]
-                 text_embeddings = torch.nn.functional.normalize(
-                     text_embeddings, p=2, dim=-1
-                 )
-                 for i, embedding in enumerate(text_embeddings):
-                     all_embeddings.append((text_indices[i], embedding))
- 
-             if any(k.startswith("image_") for k in features.keys()):
-                 image_batch = {
-                     k[len("image_") :]: v.to(device)
-                     for k, v in features.items()
-                     if k.startswith("image_") and k != "image_indices"
-                 }
-                 image_indices = features.get("image_indices", [])
- 
-                 with torch.autocast(device_type=device, dtype=torch.bfloat16):
-                     img_embeddings = self.model(
-                         **image_batch, task_label=task
-                     ).single_vec_emb
-                 if truncate_dim:
-                     img_embeddings = img_embeddings[:, :truncate_dim]
-                 img_embeddings = torch.nn.functional.normalize(
-                     img_embeddings, p=2, dim=-1
-                 )
- 
-                 for i, embedding in enumerate(img_embeddings):
-                     all_embeddings.append((image_indices[i], embedding))
- 
-             if not all_embeddings:
-                 raise RuntimeError("No embeddings were generated")
- 
-             all_embeddings.sort(key=lambda x: x[0])  # sort by original index
-             combined_embeddings = torch.stack([emb for _, emb in all_embeddings])
-             features["sentence_embedding"] = combined_embeddings
  
-         return features
  
      @classmethod
      def load(cls, input_path: str) -> "Transformer":
-         return cls(model_name_or_path=input_path)
  import json
+ import logging
  import os
  from io import BytesIO
+ from typing import Any, Dict, List, Optional, Tuple, Union
  
  import torch
  from torch import nn
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
+
+ logger = logging.getLogger(__name__)
  
  
  class Transformer(nn.Module):
+     """Huggingface AutoModel to generate token embeddings.
+     Loads the correct class, e.g. BERT / RoBERTa etc.
+
+     Args:
+         model_name_or_path: Huggingface models name
+             (https://huggingface.co/models)
+         max_seq_length: Truncate any inputs longer than max_seq_length
+         model_args: Keyword arguments passed to the Huggingface
+             Transformers model
+         tokenizer_args: Keyword arguments passed to the Huggingface
+             Transformers tokenizer
+         config_args: Keyword arguments passed to the Huggingface
+             Transformers config
+         cache_dir: Cache dir for Huggingface Transformers to store/load
+             models
+         do_lower_case: If true, lowercases the input (independent if the
+             model is cased or not)
+         tokenizer_name_or_path: Name or path of the tokenizer. When
+             None, then model_name_or_path is used
+     """
  
      save_in_root: bool = True
  
      def __init__(
          self,
+         model_name_or_path: str,
+         max_seq_length: int = None,
+         model_args: Dict[str, Any] = None,
+         tokenizer_args: Dict[str, Any] = None,
+         config_args: Dict[str, Any] = None,
+         cache_dir: str = None,
+         do_lower_case: bool = False,
+         tokenizer_name_or_path: str = None,
          **kwargs,
      ) -> None:
+         super().__init__()
+         self.config_keys = ["max_seq_length", "do_lower_case"]
+         self.do_lower_case = do_lower_case
+         if model_args is None:
+             model_args = {}
+         if tokenizer_args is None:
+             tokenizer_args = {}
+         if config_args is None:
+             config_args = {}
+
+         if kwargs.get("backend", "torch") != "torch":
+             logger.warning(
+                 f'"jinaai/jina-embeddings-v3" is currently not compatible with the {kwargs["backend"]} backend. '
+                 'Continuing with the "torch" backend.'
              )
+
+         self.config = AutoConfig.from_pretrained(model_name_or_path, **config_args, cache_dir=cache_dir)
+
+         self._lora_adaptations = self.config.lora_adaptations
+         if (
+             not isinstance(self._lora_adaptations, list)
+             or len(self._lora_adaptations) < 1
+         ):
              raise ValueError(
+                 f"`lora_adaptations` must be a list and contain at least one element"
              )
+         self._adaptation_map = {
+             name: idx for idx, name in enumerate(self._lora_adaptations)
+         }
  
+         self.default_task = model_args.pop('default_task', None)
+
+         self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir, **model_args)
+
+         if max_seq_length is not None and "model_max_length" not in tokenizer_args:
+             tokenizer_args["model_max_length"] = max_seq_length
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path,
              cache_dir=cache_dir,
+             **tokenizer_args,
          )
  
+         # No max_seq_length set. Try to infer from model
+         if max_seq_length is None:
+             if (
+                 hasattr(self.auto_model, "config")
+                 and hasattr(self.auto_model.config, "max_position_embeddings")
+                 and hasattr(self.tokenizer, "model_max_length")
+             ):
+                 max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length)
+
+         self.max_seq_length = max_seq_length
+
+         if tokenizer_name_or_path is not None:
+             self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__
+
+
+     @property
+     def default_task(self):
+         return self._default_task
+
+     @default_task.setter
+     def default_task(self, task: Union[None, str]):
+         self._validate_task(task)
+         self._default_task = task
+
+
+     def _validate_task(self, task: str):
+         if task and task not in self._lora_adaptations:
+             raise ValueError(
+                 f"Unsupported task '{task}'. "
+                 f"Supported tasks are: {', '.join(self.config.lora_adaptations)}. "
+                 f"Alternatively, don't pass the `task` argument to disable LoRA."
+             )
+
+     def forward(
+         self, features: Dict[str, torch.Tensor], task: Optional[str] = None
      ) -> Dict[str, torch.Tensor]:
+         """Returns token_embeddings, cls_token"""
+         self._validate_task(task)
+         task = task or self.default_task
+         adapter_mask = None
+         if task:
+             task_id = self._adaptation_map[task]
+             num_examples = features['input_ids'].size(0)
+             adapter_mask = torch.full(
+                 (num_examples,), task_id, dtype=torch.int32, device=features['input_ids'].device
              )
  
+         lora_arguments = (
+             {"adapter_mask": adapter_mask} if adapter_mask is not None else {}
+         )
+         features.pop('prompt_length', None)
+         output_states = self.auto_model.forward(**features, **lora_arguments, return_dict=False)
+         output_tokens = output_states[0]
+         features.update({"token_embeddings": output_tokens, "attention_mask": features["attention_mask"]})
+         return features
  
+     def get_word_embedding_dimension(self) -> int:
+         return self.auto_model.config.hidden_size
  
+     def tokenize(
          self,
+         texts: Union[List[str], List[dict], List[Tuple[str, str]]],
+         padding: Union[str, bool] = True
      ) -> Dict[str, torch.Tensor]:
+         """Tokenizes a text and maps tokens to token-ids"""
+         output = {}
+         if isinstance(texts[0], str):
+             to_tokenize = [texts]
+         elif isinstance(texts[0], dict):
+             to_tokenize = []
+             output["text_keys"] = []
+             for lookup in texts:
+                 text_key, text = next(iter(lookup.items()))
+                 to_tokenize.append(text)
+                 output["text_keys"].append(text_key)
+             to_tokenize = [to_tokenize]
          else:
+             batch1, batch2 = [], []
+             for text_tuple in texts:
+                 batch1.append(text_tuple[0])
+                 batch2.append(text_tuple[1])
+             to_tokenize = [batch1, batch2]
+
+         # strip
+         to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize]
+
+         # Lowercase
+         if self.do_lower_case:
+             to_tokenize = [[s.lower() for s in col] for col in to_tokenize]
+
+         output.update(
+             self.tokenizer(
+                 *to_tokenize,
+                 padding=padding,
+                 truncation="longest_first",
+                 return_tensors="pt",
+                 max_length=self.max_seq_length,
+             )
+         )
+         return output
+
+     def get_config_dict(self) -> Dict[str, Any]:
+         return {key: self.__dict__[key] for key in self.config_keys}
+
+     def save(self, output_path: str, safe_serialization: bool = True) -> None:
+         self.auto_model.save_pretrained(output_path, safe_serialization=safe_serialization)
+         self.tokenizer.save_pretrained(output_path)
+
+         with open(os.path.join(output_path, "sentence_bert_config.json"), "w") as fOut:
+             json.dump(self.get_config_dict(), fOut, indent=2)
  
  
      @classmethod
      def load(cls, input_path: str) -> "Transformer":
+         # Old classes used other config names than 'sentence_bert_config.json'
+         for config_name in [
+             "sentence_bert_config.json",
+             "sentence_roberta_config.json",
+             "sentence_distilbert_config.json",
+             "sentence_camembert_config.json",
+             "sentence_albert_config.json",
+             "sentence_xlm-roberta_config.json",
+             "sentence_xlnet_config.json",
+         ]:
+             sbert_config_path = os.path.join(input_path, config_name)
+             if os.path.exists(sbert_config_path):
+                 break
+
+         with open(sbert_config_path) as fIn:
+             config = json.load(fIn)
+         # Don't allow configs to set trust_remote_code
+         if "model_args" in config and "trust_remote_code" in config["model_args"]:
+             config["model_args"].pop("trust_remote_code")
+         if "tokenizer_args" in config and "trust_remote_code" in config["tokenizer_args"]:
+             config["tokenizer_args"].pop("trust_remote_code")
+         if "config_args" in config and "trust_remote_code" in config["config_args"]:
+             config["config_args"].pop("trust_remote_code")
+         return cls(model_name_or_path=input_path, **config)
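The core change in the rewritten `forward()` is routing a task name to a per-example LoRA adapter id via `adapter_mask`. A standalone sketch of that routing; the adapter names below are illustrative, the real list comes from `config.lora_adaptations`:

```python
# Sketch of the task-to-adapter routing done in Transformer.forward().
# The adapter names are illustrative; the real list comes from config.lora_adaptations.
import torch

lora_adaptations = ["retrieval", "text-matching", "code"]  # assumed example
adaptation_map = {name: idx for idx, name in enumerate(lora_adaptations)}

task = "text-matching"
num_examples = 4  # batch size, i.e. features["input_ids"].size(0)
adapter_mask = torch.full((num_examples,), adaptation_map[task], dtype=torch.int32)
print(adapter_mask)  # tensor([1, 1, 1, 1], dtype=torch.int32): one adapter id per example
```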
inference.py ADDED
@@ -0,0 +1,132 @@
+ import os
+ import torch
+ import json
+ from typing import Dict, List, Union, Optional, Any
+ from PIL import Image
+ from transformers import AutoConfig, AutoTokenizer
+ from custom_st import Transformer
+
+ class InferenceEmbeddings:
+     def __init__(self, model_path: str):
+         """
+         Initialize the embedding model for inference
+
+         Args:
+             model_path: Path to the model directory
+         """
+         self.model_path = model_path
+         self.model = Transformer(
+             model_name_or_path=model_path,
+             model_args={"default_task": "retrieval", "trust_remote_code": True},
+             trust_remote_code=True
+         )
+         self.model.eval()
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device)
+
+     def encode_text(self,
+                     texts: List[str],
+                     task: str = "retrieval",
+                     prompt_name: Optional[str] = None,
+                     truncate_dim: Optional[int] = None,
+                     return_multivector: bool = False,
+                     max_length: Optional[int] = None,
+                     batch_size: int = 32) -> torch.Tensor:
+         """
+         Encode text inputs to embeddings
+
+         Args:
+             texts: List of text inputs to encode
+             task: Task for which to generate embeddings (retrieval, text-matching, code)
+             prompt_name: Optional prompt type (query, passage)
+             truncate_dim: Optional dimension to truncate embeddings to
+             return_multivector: Whether to return multi-vector embeddings
+             max_length: Maximum token length
+             batch_size: Batch size for encoding
+
+         Returns:
+             Tensor of embeddings
+         """
+         if prompt_name:
+             # Add prompt prefix based on prompt_name
+             if prompt_name == "query":
+                 texts = [f"Query: {text}" for text in texts]
+             elif prompt_name == "passage":
+                 texts = [f"Passage: {text}" for text in texts]
+
+         embeddings = []
+         for i in range(0, len(texts), batch_size):
+             batch_texts = texts[i:i+batch_size]
+             features = self.model.tokenize(batch_texts)
+
+             # Move tensors to device
+             for key, value in features.items():
+                 if isinstance(value, torch.Tensor):
+                     features[key] = value.to(self.device)
+
+             with torch.no_grad():
+                 outputs = self.model.forward(features, task=task, truncate_dim=truncate_dim)
+                 batch_embeddings = outputs.get("sentence_embedding", None)
+
+             if batch_embeddings is not None:
+                 embeddings.append(batch_embeddings.cpu())
+
+         if embeddings:
+             return torch.cat(embeddings, dim=0)
+         else:
+             raise RuntimeError("Failed to generate embeddings")
+
+     def encode_image(self,
+                      images: List[Union[str, Image.Image]],
+                      task: str = "retrieval",
+                      truncate_dim: Optional[int] = None,
+                      return_multivector: bool = False,
+                      max_pixels: Optional[int] = None,
+                      batch_size: int = 8) -> torch.Tensor:
+         """
+         Encode image inputs to embeddings
+
+         Args:
+             images: List of image inputs (file paths, URLs, or PIL Images)
+             task: Task for which to generate embeddings
+             truncate_dim: Optional dimension to truncate embeddings to
+             return_multivector: Whether to return multi-vector embeddings
+             max_pixels: Maximum number of pixels for image resizing
+             batch_size: Batch size for encoding
+
+         Returns:
+             Tensor of embeddings
+         """
+         embeddings = []
+         for i in range(0, len(images), batch_size):
+             batch_images = images[i:i+batch_size]
+             features = self.model.tokenize(batch_images)
+
+             # Move tensors to device
+             for key, value in features.items():
+                 if isinstance(value, torch.Tensor):
+                     features[key] = value.to(self.device)
+
+             with torch.no_grad():
+                 outputs = self.model.forward(features, task=task, truncate_dim=truncate_dim)
+                 batch_embeddings = outputs.get("sentence_embedding", None)
+
+             if batch_embeddings is not None:
+                 embeddings.append(batch_embeddings.cpu())
+
+         if embeddings:
+             return torch.cat(embeddings, dim=0)
+         else:
+             raise RuntimeError("Failed to generate embeddings")
+
+ def load_model(model_path: str):
+     """
+     Load the embedding model for inference
+
+     Args:
+         model_path: Path to the model directory
+
+     Returns:
+         Loaded model instance
+     """
+     return InferenceEmbeddings(model_path)
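A short usage sketch for the wrapper above; the model directory is a placeholder, and `encode_text` assumes the underlying `custom_st.Transformer` returns a `sentence_embedding` key:

```python
# Sketch only: using inference.py's loader; "path/to/model" is a placeholder path.
from inference import load_model

model = load_model("path/to/model")
query_embs = model.encode_text(
    ["What is visual document retrieval?"],
    task="retrieval",
    prompt_name="query",
)
print(query_embs.shape)  # (1, embedding_dim)
```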
model_card.md ADDED
@@ -0,0 +1,85 @@
+ ---
+ language: multilingual
+ license: other
+ datasets:
+ - jinaai/jina-vdr
+ pipeline_tag: feature-extraction
+ tags:
+ - embeddings
+ - multilingual-embeddings
+ - multimodal-embeddings
+ - text-to-image
+ - sentence-transformers
+ - sentence-similarity
+ - visual-document-retrieval
+ ---
+
+ # Custom Embedding Model
+
+ This is a custom embedding model based on the Jina Embeddings V4 architecture, specially adapted for embedding tasks involving text, images, and visual documents.
+
+ ## Model Description
+
+ The model supports:
+
+ - **Multimodal Embeddings**: Generate unified embeddings for text and images
+ - **Multilingual Support**: Works across 30+ languages
+ - **Task-specific Modes**: Optimized for retrieval, text-matching, and code tasks
+ - **Flexible Dimensions**: Dense embeddings that can be truncated with minimal performance loss
+
+ ## Usage
+
+ ### Text Embeddings
+
+ ```python
+ from custom_st import Transformer
+
+ # Initialize the model
+ model = Transformer(
+     model_name_or_path="path/to/model",
+     model_args={"default_task": "retrieval", "trust_remote_code": True},
+     trust_remote_code=True
+ )
+
+ # Encode text
+ texts = ["Your text here", "Another text example"]
+ features = model.tokenize(texts)
+ outputs = model.forward(features, task="retrieval")
+ embeddings = outputs["sentence_embedding"]
+ ```
+
+ ### Image Embeddings
+
+ ```python
+ from PIL import Image
+ from custom_st import Transformer
+
+ # Initialize the model
+ model = Transformer(
+     model_name_or_path="path/to/model",
+     model_args={"default_task": "retrieval", "trust_remote_code": True},
+     trust_remote_code=True
+ )
+
+ # Load images
+ images = [Image.open("image1.jpg"), Image.open("image2.jpg")]
+ # Or use URLs
+ image_urls = ["http://example.com/image1.jpg", "http://example.com/image2.jpg"]
+
+ # Encode images
+ features = model.tokenize(images)  # or model.tokenize(image_urls)
+ outputs = model.forward(features, task="retrieval")
+ embeddings = outputs["sentence_embedding"]
+ ```
+
+ ## Requirements
+
+ - Python 3.8+
+ - PyTorch 2.0+
+ - Transformers 4.30+
+ - PEFT 0.4+
+ - Pillow 9.0+
+
+ ## License
+
+ This model is available under the same terms as the original model it's based on. Please refer to the license information in the repository for details.
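Since text and image embeddings share one space, they can be compared directly. A hedged continuation of the two snippets above, reusing the same `model` and the example image file from the card:

```python
# Sketch only: cosine similarity between a text and an image embedding, continuing
# from the model card's snippets (both return a "sentence_embedding" tensor).
import torch.nn.functional as F

text_features = model.tokenize(["A photo of a cat"])
image_features = model.tokenize([Image.open("image1.jpg")])

text_emb = model.forward(text_features, task="retrieval")["sentence_embedding"]
image_emb = model.forward(image_features, task="retrieval")["sentence_embedding"]

similarity = F.cosine_similarity(text_emb, image_emb)
print(similarity.item())
```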
pipeline.py ADDED
@@ -0,0 +1,98 @@
+ from typing import List, Union, Dict, Any, Optional
+ import torch
+ import numpy as np
+ from PIL import Image
+ from transformers import Pipeline
+ from custom_st import Transformer
+
+ class EmbeddingPipeline(Pipeline):
+     """
+     Pipeline for generating embeddings using custom transformer model
+     """
+
+     def __init__(self, model, **kwargs):
+         super().__init__(model=model, **kwargs)
+
+         # Default task if not specified
+         self.default_task = "retrieval"
+
+     def _sanitize_parameters(self, task=None, truncate_dim=None, **kwargs):
+         preprocess_params = {}
+         forward_params = {}
+         postprocess_params = {}
+
+         if task is not None:
+             forward_params["task"] = task
+
+         if truncate_dim is not None:
+             forward_params["truncate_dim"] = truncate_dim
+
+         return preprocess_params, forward_params, postprocess_params
+
+     def preprocess(self, inputs, **preprocess_params):
+         """
+         Preprocess the inputs before passing to model
+         """
+         # Handle single input vs list of inputs
+         if not isinstance(inputs, list):
+             inputs = [inputs]
+
+         # Tokenize/prepare the inputs
+         features = self.model.tokenize(inputs)
+         return features
+
+     def _forward(self, features, task=None, truncate_dim=None):
+         """
+         Forward pass through the model
+         """
+         # Set default task if not provided
+         if task is None:
+             task = self.default_task
+
+         # Forward pass
+         outputs = self.model.forward(features, task=task, truncate_dim=truncate_dim)
+         return outputs
+
+     def postprocess(self, model_outputs, **postprocess_params):
+         """
+         Convert model outputs to final embeddings
+         """
+         # Extract embeddings
+         embeddings = model_outputs.get("sentence_embedding", None)
+
+         if embeddings is None:
+             raise ValueError("No embeddings were generated")
+
+         # Convert to numpy
+         embeddings = embeddings.cpu().numpy()
+
+         return embeddings
+
+ def load_pipeline(model_path: str, device: str = None):
+     """
+     Load the embedding pipeline from a model path
+
+     Args:
+         model_path: Path to the model directory
+         device: Device to use for inference (cpu, cuda, etc.)
+
+     Returns:
+         EmbeddingPipeline instance
+     """
+     # Determine device
+     if device is None:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # Load model
+     model = Transformer(
+         model_name_or_path=model_path,
+         model_args={"default_task": "retrieval", "trust_remote_code": True},
+         trust_remote_code=True
+     )
+     model.to(device)
+     model.eval()
+
+     # Create pipeline
+     pipeline = EmbeddingPipeline(model=model, device=device)
+
+     return pipeline
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ torch>=2.0.0
+ transformers>=4.30.2
+ peft>=0.4.0
+ pillow>=9.0.0
+ numpy>=1.22.0
+ sentencepiece>=0.1.97
+ protobuf>=3.20.0
+ accelerate>=0.20.0
+ gradio>=3.50.0
+ requests>=2.28.0
+ torchvision>=0.15.0
+ fastapi>=0.95.0
+ uvicorn>=0.22.0
+ pydantic>=1.10.0