In [1]:
import os
import hopsworks
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from docling.chunking import HybridChunker

# Export TOKENIZERS_PARALLELISM="false" into this process's environment.
# NOTE(review): presumably this silences the HuggingFace tokenizers
# fork/parallelism warning — confirm it is still needed.
os.environ.update(TOKENIZERS_PARALLELISM="false")

 from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Notebook configuration -------------------------------------------------
# How the loader exports the document; the splitting cell branches on this.
EXPORT_TYPE = ExportType.DOC_CHUNKS
# Model id shared by the chunker's tokenizer and the SentenceTransformer.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
# Source PDF, relative to the notebook's working directory.
PDF_PATH = "content/Building+Machine+Learning+Systems+with+a+Feature+Store.pdf"

In [3]:
# Build the chunker first; it is given the embedding model id as its tokenizer.
chunker = HybridChunker(tokenizer=EMBED_MODEL_ID)

# Wire the PDF path, export type, and chunker into the Docling loader.
loader = DoclingLoader(file_path=PDF_PATH, export_type=EXPORT_TYPE, chunker=chunker)

# Run the conversion and report how many documents came back.
docs = loader.load()
print(f"Loaded {len(docs)} document chunks")

2025-12-02 19:43:33,611 INFO: detected formats: []
2025-12-02 19:43:33,861 INFO: Going to convert document batch...
2025-12-02 19:43:33,863 INFO: Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2025-12-02 19:43:33,914 INFO: Loading plugin 'docling_defaults'
2025-12-02 19:43:33,926 INFO: Registered picture descriptions: ['vlm', 'api']
2025-12-02 19:43:33,982 INFO: Loading plugin 'docling_defaults'
2025-12-02 19:43:34,010 INFO: Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-02 19:43:42,281 INFO: Auto OCR model selected ocrmac.
2025-12-02 19:43:42,299 INFO: Loading plugin 'docling_defaults'
2025-12-02 19:43:42,323 INFO: Registered layout engines: ['docling_layout_default', 'docling_experimental_table_crops_layout']
2025-12-02 19:43:42,347 INFO: Accelerator device: 'mps'
2025-12-02 19:43:57,907 INFO: Loading plugin 'docling_defaults'
2025-12-02 19:43:57,919 INFO: Registered table structure

Token indices sequence length is longer than the specified maximum sequence length for this model (1143 > 512). Running this sequence through the model will result in indexing errors


Loaded 1333 document chunks


In [11]:
# Print the second loaded chunk (text and metadata) as a sanity check.
second_chunk = docs[1]
print(second_chunk)

page_content='Praise for Building Machine Learning Systems with a Feature Store
It' s easy to be lost in quality metrics land and forget about the crucial systems aspect to ML. Jim does a great job explaining those aspects and gives a lot of practical tips on how to survive a long deployment.
-Hannes Mühleisen, cocreator of DuckDB
Building machine learning systems in production has historically involved a lot of black magic and undocumented learnings. Jim Dowling is doing a great service to ML practitioners by sharing the best practices and putting together clear step-by-step guide.' metadata={'source': 'content/Building+Machine+Learning+Systems+with+a+Feature+Store.pdf', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/7', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 97.75, 't': 162.01999999999998, 'r': 432.0, 'b': 126.0299999999

In [4]:
# DOC_CHUNKS output is used as-is; any other export type is split further
# on its Markdown headers.
if EXPORT_TYPE != ExportType.DOC_CHUNKS:
    from langchain_text_splitters import MarkdownHeaderTextSplitter

    header_levels = [
        ("#", "Header_1"),
        ("##", "Header_2"),
        ("###", "Header_3"),
    ]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header_levels)

    # Flatten the per-document split lists into one list.
    splits = []
    for doc in docs:
        splits.extend(splitter.split_text(doc.page_content))
else:
    splits = docs

print(f"Created {len(splits)} splits")
print(f"Sample: {splits[0].page_content[:100]}...")

Created 1333 splits
Sample: Praise for Building Machine Learning Systems with a Feature Store
I witnessed the rise of feature st...


In [5]:
# Load the sentence-embedding model named in the configuration cell.
embeddings = SentenceTransformer(model_name_or_path=EMBED_MODEL_ID)

2025-12-02 19:52:07,229 INFO: Use pytorch device_name: mps
2025-12-02 19:52:07,232 INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [6]:
# Collect chunk text and metadata into two index-aligned lists.
texts, metadatas = [], []
for chunk in splits:
    texts.append(chunk.page_content)
    metadatas.append(chunk.metadata)

# Encode all texts in batches of 32, showing a progress bar.
vectors = embeddings.encode(texts, batch_size=32, show_progress_bar=True)
print(f"Created {len(vectors)} embeddings")

Batches: 100%|██████████| 42/42 [00:18<00:00, 2.31it/s]


Created 1333 embeddings


In [7]:
# Log in to Hopsworks; the returned project object is the entry point for
# the calls below.
project = hopsworks.login()
# Handle to the project's feature store, used later to create the feature group.
fs = project.get_feature_store()

2025-12-02 19:52:44,050 INFO: Initializing external client
2025-12-02 19:52:44,064 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-12-02 19:52:47,302 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1271977


In [8]:
def _first_page_no(metadata):
    """Return the page a chunk starts on, or 0 when no page is recorded.

    Lookup order:
      1. A top-level ``page`` key, then ``page_number`` (the original lookups).
      2. The first ``page_no`` found under
         ``metadata["dl_meta"]["doc_items"][*]["prov"][*]``, which is where the
         Docling chunk metadata printed earlier in this notebook keeps it.
      3. ``0`` as the "unknown" fallback.

    Args:
        metadata: The metadata dict attached to one split.

    Returns:
        The page value found, or ``0``.
    """
    for key in ("page", "page_number"):
        if key in metadata:
            return metadata[key]

    # Docling nests provenance per document item; take the first page seen.
    dl_meta = metadata.get("dl_meta") or {}
    for item in dl_meta.get("doc_items") or []:
        for prov in item.get("prov") or []:
            page_no = prov.get("page_no")
            if page_no is not None:
                return page_no
    return 0


# One record per chunk: a sequential id, the text, its page, and its embedding.
data = [
    {
        'id': i,
        'text': text,
        'page': _first_page_no(metadata),
        'embedding': vector,
    }
    for i, (text, vector, metadata) in enumerate(zip(texts, vectors, metadatas))
]

df = pd.DataFrame(data)
print(f"Created dataframe with {len(df)} rows")

Created dataframe with 1333 rows


In [9]:
# Settings for the feature group that stores the chunk rows, keyed by "id".
# NOTE(review): no embedding_index is passed here; if these vectors are meant
# for similarity search, confirm whether the feature group needs one.
fg_settings = dict(
    name="book_embeddings",
    version=2,
    primary_key=["id"],
    description="Book text chunks with embeddings",
)
book_fg = fs.get_or_create_feature_group(**fg_settings)

# Upload the dataframe; the call's return value is the cell's displayed output.
book_fg.insert(df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1271977/fs/1258579/fg/1790385


Uploading Dataframe: 100.00% |██████████| Rows 1333/1333 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: book_embeddings_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1271977/jobs/named/book_embeddings_2_offline_fg_materialization/executions


(Job('book_embeddings_2_offline_fg_materialization', 'SPARK'), None)