feat(vector): Automatic indexing of documents in s3 storage

2026-01-30 22:38:08 +01:00
commit e60eb297fe
15 changed files with 1026 additions and 0 deletions
--- a/vector/chunk.py
+++ b/vector/chunk.py
@@ -0,0 +1,41 @@
+from typing import Optional
+from .client import EmbeddedClient
+
+class Chunk:
+    """A numbered piece of text plus the embedding computed for it, if any.
+
+    Attributes:
+        id: Identifier supplied by whoever creates the chunk.
+        text: The chunk's text.
+        size: ``len(text)`` captured at construction; it is a plain attribute
+            and is not refreshed if ``text`` is reassigned later.
+    """
+
+    def __init__(self, id: int, text: str):
+        self.id = id
+        self.text = text
+        self.size = len(text)
+        # No embedding until embed() stores one.
+        self.__embedding = None
+
+    def embed(self, client: EmbeddedClient) -> list[float]:
+        """Embed ``self.text`` through ``client`` and remember the result.
+
+        Each call invokes ``client.embed`` again and overwrites whatever was
+        stored before.
+
+        Returns:
+            The value returned by ``client.embed`` for this chunk's text.
+        """
+        vector = client.embed(self.text)
+        self.__embedding = vector
+        return vector
+
+    @property
+    def has_embedding(self) -> bool:
+        """``True`` once a non-``None`` embedding has been stored."""
+        return self.__embedding is not None
+
+    @property
+    def embedding(self) -> Optional[list[float]]:
+        """The stored embedding, or ``None`` until ``embed`` stores one."""
+        return self.__embedding
+
+class Chunks:
+    """Split text into fixed-size ``Chunk`` objects that overlap each other.
+
+    Attributes:
+        size: Maximum number of characters in one chunk.
+        overlap: Number of characters by which consecutive chunks overlap;
+            chunk starts are ``size - overlap`` characters apart.
+    """
+
+    def __init__(self, size: int = 5000, overlap: int = 200) -> None:
+        self.size = size
+        self.overlap = overlap
+        # Number of chunks produced by the most recent chunk() call.
+        self.__count = 0
+
+    def chunk(self, text: str) -> list[Chunk]:
+        """Split ``text`` into chunks with ids numbered from 1.
+
+        Chunk ``k`` (1-based) is ``text[s:s + size]`` with
+        ``s = (k - 1) * (size - overlap)``. The last chunk may be shorter than
+        ``size``; when the text ends inside an overlap region it contains only
+        characters already present in the previous chunk. Empty text yields
+        an empty list. The result length is also recorded in ``count``.
+
+        Raises:
+            ValueError: If ``size - overlap`` is not positive. Such a
+                configuration can never advance through the text.
+        """
+        # size and overlap are public, mutable attributes, so the check has to
+        # happen here rather than only at construction time. Without it a
+        # non-positive step makes the scan loop forever on non-empty text.
+        step = self.size - self.overlap
+        if step <= 0:
+            raise ValueError(
+                f"overlap ({self.overlap}) must be smaller than size ({self.size})"
+            )
+
+        chunks = [
+            Chunk(id=chunk_id, text=text[start:start + self.size])
+            for chunk_id, start in enumerate(range(0, len(text), step), start=1)
+        ]
+        self.__count = len(chunks)
+        return chunks
+
+    @property
+    def count(self) -> int:
+        """Number of chunks from the latest ``chunk`` call (0 before any call)."""
+        return self.__count