feat(vector): Automatic indexing of documents in S3 storage

2026-01-30 22:38:08 +01:00
commit e60eb297fe
15 changed files with 1026 additions and 0 deletions
--- a/vector/init.py
+++ b/vector/init.py
@@ -0,0 +1,5 @@
+from .chunk import Chunks, Chunk
+from .client import EmbeddedClient
+from .qdrant import Qdrant
+
+# Public API of the package: the names exported by a star-import.
+__all__ = ["Chunks", "Chunk", "EmbeddedClient", "Qdrant"]
--- a/vector/chunk.py
+++ b/vector/chunk.py
@@ -0,0 +1,41 @@
+from typing import Optional
+from .client import EmbeddedClient
+
+class Chunk:
+    """A numbered piece of text together with its optional embedding vector."""
+
+    def __init__(self, id: int, text: str):
+        """Store the identifier and the text; the embedding starts out unset."""
+        self.__embedding = None
+        self.id = id
+        self.text = text
+        # Character count of the text, captured once at construction time.
+        self.size = len(text)
+
+    @property
+    def has_embedding(self) -> bool:
+        """Whether an embedding has been stored on this chunk."""
+        return not (self.__embedding is None)
+
+    @property
+    def embedding(self) -> Optional[list[float]]:
+        """The stored embedding, or ``None`` while none has been stored."""
+        return self.__embedding
+
+    def embed(self, client: EmbeddedClient) -> list[float]:
+        """Embed this chunk's text with ``client``, keep the vector and return it."""
+        vector = client.embed(self.text)
+        self.__embedding = vector
+        return vector
+
+class Chunks:
+    """Split text into fixed-size, overlapping character windows.
+
+    ``size`` is the maximum number of characters per chunk and ``overlap`` is
+    the number of characters a chunk shares with the one before it.
+    """
+
+    def __init__(self, size: int = 5000, overlap: int = 200) -> None:
+        self.size = size
+        self.overlap = overlap
+        # Number of chunks produced by the most recent ``chunk`` call.
+        self.__count = 0
+
+    def chunk(self, text: str) -> list[Chunk]:
+        """Split ``text`` into ``Chunk`` objects whose ids start at 1.
+
+        Consecutive chunks start ``size - overlap`` characters apart.
+        Splitting stops as soon as a chunk reaches the end of ``text``, so no
+        trailing chunk is emitted that would only repeat the tail of the
+        previous one. An empty ``text`` yields an empty list.
+
+        Raises:
+            ValueError: If ``size`` is not positive or ``overlap`` is not
+                smaller than ``size``; the window would never advance and
+                the loop would not terminate.
+        """
+        # Validated here rather than in __init__ because both attributes are
+        # public and may be reassigned after construction.
+        if self.size <= 0:
+            raise ValueError(f"size must be positive, got {self.size}")
+        step = self.size - self.overlap
+        if step <= 0:
+            raise ValueError(
+                f"overlap ({self.overlap}) must be smaller than size ({self.size})"
+            )
+
+        chunks: list[Chunk] = []
+        start = 0
+        while start < len(text):
+            end = start + self.size
+            chunks.append(Chunk(id=len(chunks) + 1, text=text[start:end]))
+            if end >= len(text):
+                # This chunk already covers the rest of the text.
+                break
+            start += step
+        self.__count = len(chunks)
+        return chunks
+
+    @property
+    def count(self) -> int:
+        """Number of chunks produced by the most recent ``chunk`` call."""
+        return self.__count
--- a/vector/client.py
+++ b/vector/client.py
@@ -0,0 +1,19 @@
+import requests
+
+class EmbeddedClient:
+    """Small HTTP client that requests text embeddings from an embeddings endpoint."""
+
+    def __init__(
+        self,
+        authorization: str,
+        url: str = "https://openrouter.ai/api/v1/embeddings",
+        model: str = "qwen/qwen3-embedding-8b",
+    ) -> None:
+        """Remember the Authorization header value, the endpoint URL and the model name."""
+        self.authorization = authorization
+        self.model = model
+        self.url = url
+
+    def embed(self, text: str) -> list[float]:
+        """POST ``text`` to the endpoint and return the first embedding in the reply.
+
+        ``authorization`` is sent verbatim as the ``Authorization`` header.
+        Non-success HTTP responses raise through ``raise_for_status``.
+        """
+        print(f"Embedding text {text}...")
+        request_headers = {
+            "Authorization": self.authorization,
+            "Content-Type": "application/json",
+        }
+        payload = {"input": text, "model": self.model}
+        reply = requests.post(
+            self.url,
+            headers=request_headers,
+            json=payload,
+            timeout=60,
+        )
+        reply.raise_for_status()
+        # The vector is read from the first entry of the reply's "data" list.
+        return reply.json()["data"][0]["embedding"]
--- a/vector/qdrant.py
+++ b/vector/qdrant.py
@@ -0,0 +1,57 @@
+import uuid
+
+from qdrant_client import QdrantClient
+from qdrant_client.models import PointStruct, VectorParams, Distance
+
+from .chunk import Chunk
+
+class Qdrant:
+    """Thin wrapper around ``QdrantClient`` bound to a single collection."""
+
+    def __init__(self, host: str, port: int, collection_name: str, vector_size: int = 4096) -> None:
+        """Connect to Qdrant at ``host``:``port`` and remember the collection.
+
+        ``vector_size`` is the dimensionality used when the collection is
+        created; it has to match the length of the embeddings stored in it.
+        """
+        self.client = QdrantClient(host=host, port=port)
+        self.collection_name = collection_name
+        self.vector_size = vector_size
+
+    def create_collection(self) -> None:
+        """Create the collection with cosine distance and ``vector_size`` dimensions."""
+        self.client.create_collection(
+            collection_name=self.collection_name,
+            vectors_config=VectorParams(
+                size=self.vector_size,
+                distance=Distance.COSINE,
+            ),
+        )
+
+    def create_if_not_exists_collection(self) -> None:
+        """Create the collection unless the server reports that it already exists."""
+        if not self.client.collection_exists(collection_name=self.collection_name):
+            self.create_collection()
+
+    def delete_collection(self) -> None:
+        """Delete the collection."""
+        self.client.delete_collection(collection_name=self.collection_name)
+
+    def create_points(self, chunks: list[Chunk], bucket: str, object_name: str) -> list[PointStruct]:
+        """Build one point per chunk; see ``create_point`` for the point layout."""
+        return [self.create_point(chunk, bucket, object_name) for chunk in chunks]
+
+    def create_point(self, chunk: Chunk, bucket: str, object_name: str) -> PointStruct:
+        """Build the point for one embedded chunk of ``bucket``/``object_name``.
+
+        Chunk ids restart at 1 for every chunked text, so using them directly
+        as point ids would make chunks of different objects overwrite each
+        other inside one collection. The point id is therefore a UUIDv5
+        derived from bucket, object name and chunk id: distinct across
+        objects, yet stable, so re-indexing the same object replaces its own
+        points. The plain chunk id is still stored in the payload as ``id``.
+
+        Raises:
+            ValueError: If the chunk has no embedding.
+        """
+        embedding = chunk.embedding
+        if embedding is None:
+            raise ValueError("Chunk has no embedding")
+        point_id = str(
+            uuid.uuid5(uuid.NAMESPACE_URL, f"s3://{bucket}/{object_name}#{chunk.id}")
+        )
+        return PointStruct(
+            id=point_id,
+            vector=embedding,
+            payload={
+                "text": chunk.text,
+                "bucket": bucket,
+                "object": object_name,
+                "id": chunk.id,
+                "chunk_size": chunk.size,
+            },
+        )
+
+    def upsert_points(self, points: list[PointStruct], batch_size: int = 50) -> None:
+        """Upsert ``points`` into the collection in batches of ``batch_size``.
+
+        Raises:
+            ValueError: If ``batch_size`` is not positive. Zero would divide
+                by zero, and a negative value would skip every point while
+                still reporting success.
+        """
+        if batch_size <= 0:
+            raise ValueError(f"batch_size must be positive, got {batch_size}")
+        num_batches = (len(points) + batch_size - 1) // batch_size
+        print(f"Upserting {len(points)} points in {num_batches} batches...")
+        for batch_start in range(0, len(points), batch_size):
+            batch = points[batch_start:batch_start + batch_size]
+            self.client.upsert(collection_name=self.collection_name, points=batch)
+            print(f"Upserted {len(batch)} points...")
+        print(f"Upserted {len(points)} points!")