Spaces:

Hawky-ai
/

feather-db

Sleeping

Feather Bench

Add Benchmarks tab + headline banner — LongMemEval 0.693 visible on landing

0f62ca9 about 1 month ago

23.9 kB

	"""
	Feather DB — Interactive Demo
	HuggingFace Space · feather-db v0.6.0

	Demonstrates:
	- Semantic search over a pre-loaded knowledge graph
	- Context chain (vector search + graph BFS expansion)
	- Graph health report
	- feather_why — retrieval score breakdown
	- Add new intel nodes live
	"""

	import hashlib
	import json
	import time

	import gradio as gr
	import numpy as np

	try:
	import feather_db
	_FEATHER_OK = True
	except ImportError:
	_FEATHER_OK = False


	# ── Offline embedder (no API key needed) ──────────────────────────────────────
	def _embed(text: str, dim: int = 768) -> np.ndarray:
	    """Hash *text* into a deterministic, unit-length bag-of-words vector.

	    The text is lower-cased, commas and periods become spaces, and the
	    result is split on whitespace. Each token's MD5 digest selects eight
	    buckets (the digest shifted right by 0, 5, ..., 35 bits, modulo *dim*);
	    bucket ``j`` receives weight ``1 / (j + 1)``.

	    Args:
	        text: Input string to embed.
	        dim: Length of the returned vector.

	    Returns:
	        A float32 array of length *dim*, L2-normalised unless no token
	        contributed anything, in which case the zero vector is returned.
	    """
	    slot_weights = [1.0 / (slot + 1) for slot in range(8)]
	    out = np.zeros(dim, dtype=np.float32)
	    cleaned = text.lower().replace(",", " ").replace(".", " ")
	    for word in cleaned.split():
	        # Same integer as int(hexdigest, 16): the digest read big-endian.
	        digest = int.from_bytes(hashlib.md5(word.encode()).digest(), "big")
	        for slot, weight in enumerate(slot_weights):
	            out[(digest >> (slot * 5)) % dim] += weight
	    length = np.linalg.norm(out)
	    if length > 0:
	        return out / length
	    return out


	# ── Seed knowledge graph — AI developer tools / product intelligence ──────────
	#
	# Domain: a team building an AI-powered developer tool (editor, CLI, SDK, cloud)
	# tracks feature performance, competitor moves, community signals, and strategy.
	# All data points are realistic and meaningful for this domain.
	#
	SEED_NODES = [
	# Each entry is (node_id, content, entity_type, product, importance),
	# unpacked in exactly that order by _get_db(). node_id also drives the
	# seeded timestamp there; entity_type is stored both as meta.entity_id and
	# as the "entity_type" attribute; product feeds the Search tab's filter.
	(
	1,
	"AI autocomplete in the editor: 68% daily active usage, avg 12 completions accepted per session. "
	"Highest adoption of any feature shipped this quarter. Strongest signal in power-user cohort.",
	"feature_performance", "Editor", 0.92,
	),
	(
	2,
	"Competitor launched inline AI debugging with natural-language error explanations. "
	"3,400 GitHub stars in 48 hours. Announcement dominated dev Twitter for two days. "
	"Directly targets our core editor user base.",
	"competitor_intel", "Editor", 0.95,
	),
	(
	3,
	"StackOverflow Developer Survey 2026: 71% of developers now use AI coding assistants daily, "
	"up from 44% last year. Willingness to pay for productivity tools at an all-time high. "
	"Enterprise segment growing fastest.",
	"market_signal", "SDK", 0.90,
	),
	(
	4,
	"CLI onboarding funnel: 34% of new users drop off at step 3 (API key setup). "
	"Median time-to-first-output is 4.2 minutes — well above our 90-second target. "
	"Friction is authentication, not comprehension.",
	"user_feedback", "CLI", 0.87,
	),
	(
	5,
	"SDK v2 launched with streaming and tool-use support. Download velocity 2.1x faster than SDK v1 "
	"in the first week. Community PRs opened within 6 hours of release. "
	"Streaming is the most-requested missing feature now resolved.",
	"feature_performance", "SDK", 0.89,
	),
	(
	6,
	"Strategy brief: reduce time-to-first-value under 90 seconds for all entry points. "
	"Frictionless auth (OAuth + token auto-detect) identified as the highest-leverage lever. "
	"Target: onboarding completion rate from 66% to 85% in Q2.",
	"strategy_brief", "CLI", 0.93,
	),
	(
	7,
	"Community Discord: offline / air-gapped mode has 47 upvotes and is the top feature request. "
	"Users cite enterprise security policy and data-residency requirements. "
	"Three Fortune 500 pilots blocked specifically by this gap.",
	"community_signal", "Cloud", 0.88,
	),
	(
	8,
	"VS Code extension outperforms JetBrains plugin 3.1x in weekly active users and 4.8x in session length. "
	"Recommend 70/30 investment split. JetBrains users skew toward Java/Kotlin — "
	"worth a targeted language-server improvement sprint.",
	"channel_insight", "Editor", 0.86,
	),
	(
	9,
	"Retention analysis: power users (5+ sessions/week) show 8.4x 90-day retention vs casual users. "
	"Habit formation — not feature breadth — is the primary retention driver. "
	"Users who complete 3 sessions in week 1 have 72% chance of being active at day 90.",
	"user_feedback", "SDK", 0.91,
	),
	(
	10,
	"Open-source alternative launched under MIT license: 12k GitHub stars in first month. "
	"No cloud sync, no team features, local-only. Actively targeting our free-tier users "
	"with 'no vendor lock-in' messaging. Poses risk to top-of-funnel acquisition.",
	"competitor_intel", "Cloud", 0.93,
	),
	]

	SEED_EDGES = [
	# Each entry is (source_id, target_id, relation, weight) and is passed
	# positionally to db.link() in _get_db(). The ids refer to SEED_NODES.
	(2, 1, "contradicts", 0.90), # competitor launch threatens editor feature lead
	(3, 5, "supports", 0.85), # market survey supports SDK investment
	(4, 6, "references", 0.92), # onboarding drop-off directly informs strategy brief
	(6, 4, "derived_from", 0.88), # strategy brief derived from CLI feedback
	(8, 1, "supports", 0.78), # VS Code dominance supports editor focus
	(9, 6, "supports", 0.87), # retention data supports onboarding strategy
	(10, 7, "supports", 0.80), # OSS competitor validates offline mode demand
	(3, 1, "supports", 0.75), # rising AI adoption supports editor feature investment
	]

	# Embedding width used for the database, the seeded vectors and every query vector.
	DIM = 768
	# File path handed to feather_db.DB.open() when the demo database is created.
	_DB_PATH = "/tmp/feather_demo.feather"
	# Cached database handle; stays None until _get_db() has built and seeded it.
	_db = None


	def _get_db():
	    """Return the shared demo database, building and seeding it on first use.

	    The handle is cached in the module-level ``_db``. The first successful
	    call opens the database at ``_DB_PATH``, inserts every ``SEED_NODES``
	    entry, links every ``SEED_EDGES`` entry and saves.

	    Returns:
	        The cached database handle, or ``None`` when ``feather_db`` could
	        not be imported.
	    """
	    global _db
	    if _db is None and _FEATHER_OK:
	        store = feather_db.DB.open(_DB_PATH, dim=DIM)
	        # Seed timestamps start seven days back and advance four hours per node id.
	        week_ago = int(time.time()) - 7 * 86400

	        for node_id, text, entity_type, product, importance in SEED_NODES:
	            record = feather_db.Metadata()
	            record.timestamp = week_ago + node_id * 14400
	            record.importance = importance
	            record.confidence = 0.9
	            record.type = feather_db.ContextType.FACT
	            record.source = "demo_seed"
	            record.content = text
	            record.namespace_id = "devtools"
	            record.entity_id = entity_type
	            record.set_attribute("entity_type", entity_type)
	            record.set_attribute("product", product)
	            store.add(id=node_id, vec=_embed(text, DIM), meta=record)

	        for source, target, relation, weight in SEED_EDGES:
	            store.link(source, target, relation, weight)

	        store.save()
	        # Publish the handle only after seeding finished without raising.
	        _db = store
	    return _db


	# ── Tool implementations ───────────────────────────────────────────────────────

	def do_search(query: str, k: int, product_filter: str) -> str:
	    """Run a vector search and return the top matches as pretty-printed JSON.

	    Args:
	        query: Free-text query; empty or blank input returns a prompt message.
	        k: Maximum number of rows to return. Coerced to ``int`` because UI
	            sliders may deliver floats, and clamped to at least 1.
	        product_filter: Product name to keep, or ``"All"`` / empty for no filter.

	    Returns:
	        A JSON array of result rows, or a plain message when the database is
	        unavailable, the query is empty, or nothing matched.
	    """
	    db = _get_db()
	    if db is None:
	        return "⚠️ feather_db not installed. Run: pip install feather-db"
	    if not query or not query.strip():
	        return "Enter a query above."

	    k = max(1, int(k))
	    wanted = product_filter if product_filter and product_filter != "All" else None

	    # Over-fetch 3x so the product filter still has candidates to pick from.
	    results = db.search(_embed(query, DIM), k=k * 3)

	    rows = []
	    for r in results:
	        m = r.metadata
	        product = m.get_attribute("product")
	        if wanted is not None and product != wanted:
	            continue
	        rows.append({
	            "id": r.id,
	            "score": round(r.score, 4),
	            "entity_type": m.get_attribute("entity_type"),
	            "product": product,
	            "content": m.content,
	            "recall_count": m.recall_count,
	            "importance": round(m.importance, 3),
	        })
	        if len(rows) >= k:
	            break

	    if not rows:
	        return "No results found."
	    # ensure_ascii=False keeps em dashes and other non-ASCII characters in
	    # the node content readable instead of emitting \uXXXX escapes.
	    return json.dumps(rows, indent=2, ensure_ascii=False)


	def do_context_chain(query: str, k: int, hops: int) -> str:
	    """Run vector search plus graph expansion and return the chain as JSON.

	    Args:
	        query: Free-text seed query; empty or blank input returns a prompt message.
	        k: Number of seed nodes requested from the vector search.
	        hops: Number of graph hops passed to ``context_chain``.

	    Returns:
	        A JSON object with ``summary``, ``nodes`` (ordered by hop, then by
	        descending score, content truncated to 140 characters) and ``edges``,
	        or a plain message when the database is unavailable or the query is
	        empty.
	    """
	    db = _get_db()
	    if db is None:
	        return "⚠️ feather_db not installed."
	    if not query or not query.strip():
	        return "Enter a query above."

	    # UI sliders may deliver floats; the database call and the summary want ints.
	    k, hops = int(k), int(hops)
	    chain = db.context_chain(_embed(query, DIM), k=k, hops=hops, modality="text")

	    nodes = []
	    for node in sorted(chain.nodes, key=lambda n: (n.hop, -n.score)):
	        m = node.metadata
	        text = m.content
	        nodes.append({
	            "id": node.id,
	            "hop": node.hop,
	            "score": round(node.score, 4),
	            "entity_type": m.get_attribute("entity_type"),
	            "product": m.get_attribute("product"),
	            "content": text[:140] + ("…" if len(text) > 140 else ""),
	        })

	    edges = [
	        {
	            "source": e.source,
	            "target": e.target,
	            "rel_type": e.rel_type,
	            "weight": round(e.weight, 3),
	        }
	        for e in chain.edges
	    ]

	    payload = {
	        "summary": f"{len(nodes)} nodes reached across {hops} graph hop(s)",
	        "nodes": nodes,
	        "edges": edges,
	    }
	    # ensure_ascii=False so the "…" truncation marker and em dashes are shown
	    # as characters rather than \uXXXX escapes.
	    return json.dumps(payload, indent=2, ensure_ascii=False)


	def do_why(node_id: int, query: str) -> str:
	    """Explain why a node scores as it does for *query*, as pretty-printed JSON.

	    Args:
	        node_id: Id of the node to explain. ``None`` (a cleared number field)
	            returns a prompt message instead of raising ``TypeError``.
	        query: Free-text query; empty or blank input returns a prompt message.

	    Returns:
	        The JSON-encoded result of ``MemoryManager.why_retrieved``, or a plain
	        message when the database is unavailable or an input is missing.
	    """
	    db = _get_db()
	    if db is None:
	        return "⚠️ feather_db not installed."
	    if not query or not query.strip():
	        return "Enter a query above."
	    if node_id is None:
	        return "Enter a node ID above."

	    # Imported lazily so the module still loads when feather_db is missing.
	    from feather_db.memory import MemoryManager

	    result = MemoryManager.why_retrieved(
	        db, node_id=int(node_id), query_vec=_embed(query, DIM)
	    )
	    return json.dumps(result, indent=2, ensure_ascii=False)


	def do_health() -> str:
	    """Return the graph health report for the demo database as indented JSON.

	    Returns:
	        The JSON-encoded result of ``MemoryManager.health_report`` for the
	        ``"text"`` modality, or a plain message when the database is
	        unavailable.
	    """
	    handle = _get_db()
	    if handle is None:
	        return "⚠️ feather_db not installed."

	    # Imported lazily so the module still loads when feather_db is missing.
	    from feather_db.memory import MemoryManager

	    return json.dumps(
	        MemoryManager.health_report(handle, modality="text"),
	        indent=2,
	    )


	def do_add(content: str, entity_type: str, product: str, importance: float) -> str:
	    """Insert a user-supplied node into the live database and persist it.

	    Args:
	        content: Node text; empty or blank input is rejected with a message.
	        entity_type: Stored as ``entity_id`` and as the ``entity_type`` attribute.
	        product: Stored as the ``product`` attribute.
	        importance: Importance weight, coerced to ``float``.

	    Returns:
	        A pretty-printed JSON status object containing the new node id, or a
	        plain message when the database is unavailable or the content is empty.
	    """
	    db = _get_db()
	    if db is None:
	        return "⚠️ feather_db not installed."
	    if not content or not content.strip():
	        return "Content cannot be empty."

	    now = time.time()
	    # Id is the millisecond clock folded into 32 bits.
	    nid = int(now * 1000) % (2 ** 32)

	    meta = feather_db.Metadata()
	    meta.timestamp = int(now)
	    meta.importance = float(importance)
	    meta.confidence = 0.85
	    meta.type = feather_db.ContextType.EVENT
	    meta.source = "gradio_user"
	    meta.content = content
	    meta.namespace_id = "devtools"
	    meta.entity_id = entity_type
	    meta.set_attribute("entity_type", entity_type)
	    meta.set_attribute("product", product)

	    db.add(id=nid, vec=_embed(content, DIM), meta=meta)
	    db.save()

	    status = {
	        "status": "added",
	        "id": nid,
	        "entity_type": entity_type,
	        "product": product,
	        "tip": "Node is now live — try searching for it in the Search tab.",
	    }
	    # indent=2 matches the other handlers' output; ensure_ascii=False keeps
	    # the em dash in the tip from being rendered as \u2014.
	    return json.dumps(status, indent=2, ensure_ascii=False)


	# ── Preload on startup ────────────────────────────────────────────────────────
	# Build and seed the database at import time. _get_db() caches the handle in
	# `_db`, so the handlers' later calls return this same instance. The return
	# value is deliberately ignored (it is None when feather_db is not installed).
	_get_db()

	# ── Gradio UI ─────────────────────────────────────────────────────────────────
	# Top-level Blocks container. `demo` is the object launched by the __main__
	# guard at the bottom of the file; the CSS rule styles the "tool-output" class
	# that the gr.Code result panes are given via elem_classes.
	with gr.Blocks(
	title="Feather DB — Living Context Engine",
	theme=gr.themes.Soft(),
	css=".tool-output { font-family: monospace; font-size: 0.84rem; }",
	) as demo:

	# Static header: project links plus the headline LongMemEval banner.
	gr.HTML("""
	<div style="border-left:4px solid #6366f1;padding-left:1rem;margin-bottom:1rem">
	<h1 style="margin:0">🪶 Feather DB — Living Context Engine</h1>
	<p style="margin:0.4rem 0 0 0">
	Embedded vector DB · HNSW search · typed context graph · adaptive decay · MCP server<br/>
	<a href="https://www.getfeather.store/" target="_blank">getfeather.store</a> ·
	<a href="https://hawky.ai" target="_blank">Hawky.ai</a> ·
	<a href="https://pypi.org/project/feather-db/" target="_blank">PyPI</a> ·
	<a href="https://github.com/feather-store/feather" target="_blank">GitHub</a> ·
	<code>pip install feather-db</code>
	</p>
	</div>

	<div style="background:linear-gradient(90deg,#eef2ff,#fef3c7);border:1px solid #c7d2fe;border-radius:8px;padding:0.85rem 1rem;margin-bottom:1rem">
	<strong style="font-size:1.05rem">📊 Latest benchmark — LongMemEval (Apr 2026):</strong>
	<code style="background:white;padding:2px 6px;border-radius:4px">Feather + GPT-4o = 0.693</code> ·
	<code style="background:white;padding:2px 6px;border-radius:4px">Feather + Gemini-Flash = 0.657</code>
	<span style="color:#6b7280">— beats the LongMemEval paper's full-context GPT-4o ceiling (0.640).
	Reproducible on a $2.40 budget.</span>
	<a href="https://github.com/feather-store/feather/blob/master/docs/benchmarks/longmemeval.md"
	target="_blank" style="margin-left:0.4rem">Full report →</a>
	</div>
	""")

	# Short description of the seeded demo graph; the node and edge counts and
	# the relation names mirror SEED_NODES and SEED_EDGES.
	gr.Markdown("""
	Demo graph: 10 nodes representing product intelligence for an AI developer tools team —
	feature performance, competitor moves, community signals, strategy briefs, and user research.
	8 typed causal edges connect them (`contradicts`, `supports`, `derived_from`, `references`).
	""")

	with gr.Tabs():

	# ── Benchmarks ────────────────────────────────────────────────────────
	with gr.TabItem("📊 Benchmarks"):
	    # Static benchmark report. Table rows use plain `|` separators: a
	    # backslash-escaped pipe is an invalid Python escape sequence and, once
	    # in the string, tells Markdown to render a literal pipe instead of a
	    # table cell boundary. The `\\` pairs in the shell snippet are
	    # intentional: they produce the single `\` line continuations.
	    gr.Markdown("""
	## Feather DB v0.8.0 — Reproducible Benchmark Results

	### LongMemEval (Xu et al., 2024 / ICLR 2025)

	500-question end-to-end memory QA benchmark. Each question carries up to ~115K
	tokens of chat history; system must ingest, retrieve, and answer correctly
	across 5 memory ability axes.

	| System | Variant | Answerer | Overall | Cost / run |
	|---|---|---|---|---|
	| Feather DB v0.8.0 + decay | S | gpt-4o | 0.693 | ~$8 |
	| Feather DB v0.8.0 + decay | S | gemini-2.5-flash | 0.657 | ~$2.40 |
	| Full-context GPT-4o (paper "ceiling") | S | gpt-4o + CoN | 0.640 | n/a |
	| Zep (graphiti) | S | gpt-4o-mini | 0.638 | (vendor) |
	| Full-context GPT-4o-mini | S | gpt-4o-mini | 0.554 | n/a |
	| Naive vector RAG (paper) | S/M | gpt-4o | ~0.31 | n/a |

	**Feather + GPT-4o (0.693) beats the LongMemEval paper's full-context GPT-4o
	ceiling (0.640).** Our 10-snippet retrieval carries more useful signal to the
	answerer than dumping the whole 115K-token haystack into a frontier model — at
	~40× lower input-token cost per query.

	#### Per-axis (Feather + GPT-4o on _S_)

	| Axis | Score |
	|---|---|
	| single-session-user | 1.000 (perfect) |
	| single-session-assistant | 0.964 |
	| single-session-preference | 0.767 |
	| knowledge-update | 0.714 |
	| multi-session-reasoning | 0.606 |
	| temporal-reasoning | 0.477 |

	### ANN performance — SIFT1M (real data, 500K × 128-dim)

	| ef | p50 latency | p99 latency | Recall@10 |
	|---|---|---|---|
	| 10 | 0.07 ms | 0.13 ms | 0.774 |
	| 50 (default) | 0.19 ms | 0.23 ms | 0.972 |
	| 100 | 0.32 ms | 0.39 ms | 0.991 |
	| 200 | 0.56 ms | 0.69 ms | 0.998 |

	### Reproduce

	```bash
	pip install feather-db
	git clone https://github.com/feather-store/feather && cd feather

	python -m bench run longmemeval --dataset s --limit 0 \\
	--embedder openai \\
	--answerer-provider gemini --answerer-model gemini-2.5-flash \\
	--judge llm --judge-provider gemini --judge-model gemini-2.0-flash \\
	--decay-half-life 14 --decay-time-weight 0.4 --k 10
	```

	### Audit trail

	Per-run JSON results — every number above is one of these files:
	- HuggingFace Dataset: [Hawky-ai/feather-db-benchmarks](https://huggingface.co/datasets/Hawky-ai/feather-db-benchmarks)
	- GitHub: [`bench/results/`](https://github.com/feather-store/feather/tree/master/bench/results)
	- Full report: [`docs/benchmarks/longmemeval.md`](https://github.com/feather-store/feather/blob/master/docs/benchmarks/longmemeval.md)
	- arXiv paper: [`docs/featherdb_paper.pdf`](https://github.com/feather-store/feather/blob/master/docs/featherdb_paper.pdf)
	""")

	# ── Search ────────────────────────────────────────────────────────────
	with gr.TabItem("🔍 Semantic Search"):
	    # The blurb mentions only the product filter: that is the one filter
	    # this tab exposes (there is no entity-type control).
	    gr.Markdown("Find nodes by meaning, not keywords. Optionally filter by product.")
	    with gr.Row():
	        # Left column: query inputs. Right column: JSON result pane.
	        with gr.Column(scale=2):
	            s_query = gr.Textbox(
	                label="Query",
	                placeholder="Why is user onboarding failing?",
	            )
	            s_k = gr.Slider(1, 10, value=5, step=1, label="Top-k")
	            s_product = gr.Dropdown(
	                ["All", "Editor", "CLI", "SDK", "Cloud"],
	                value="All",
	                label="Product filter",
	            )
	            s_btn = gr.Button("Search", variant="primary")
	        with gr.Column(scale=3):
	            s_out = gr.Code(
	                label="Results",
	                language="json",
	                elem_classes=["tool-output"],
	            )

	    # Clickable presets; each row is [query, top-k, product filter].
	    gr.Examples(
	        examples=[
	            ["Why is user onboarding failing?", 5, "All"],
	            ["What competitor moves should we watch?", 5, "All"],
	            ["Which features drive retention?", 5, "SDK"],
	            ["What does the community want most?", 5, "Cloud"],
	            ["Where should we invest in the editor?", 5, "Editor"],
	        ],
	        inputs=[s_query, s_k, s_product],
	    )
	    s_btn.click(do_search, [s_query, s_k, s_product], s_out)

	# ── Context Chain ─────────────────────────────────────────────────────
	with gr.TabItem(label="🕸️ Context Chain"):
	    # Explains the two-phase retrieval performed by do_context_chain().
	    gr.Markdown(
	        value=(
	            "Two-phase retrieval — vector search finds seed nodes (hop 0), "
	            "then BFS expands outward over typed graph edges.\n\n"
	            "Use this to trace root causes: "
	            "start from a symptom, surface the events that explain it."
	        )
	    )
	    with gr.Row():
	        # Left column: seed query and traversal limits.
	        with gr.Column(scale=2):
	            c_query = gr.Textbox(placeholder="CLI adoption is slow", label="Seed query")
	            c_k = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Seed nodes (k)")
	            c_hops = gr.Slider(minimum=1, maximum=3, value=2, step=1, label="Graph hops")
	            c_btn = gr.Button(value="Run Context Chain", variant="primary")
	        # Right column: JSON result pane.
	        with gr.Column(scale=3):
	            c_out = gr.Code(
	                language="json",
	                label="Chain result",
	                elem_classes=["tool-output"],
	            )

	    # Clickable presets; each row is [seed query, k, hops].
	    gr.Examples(
	        inputs=[c_query, c_k, c_hops],
	        examples=[
	            ["CLI adoption is slow", 3, 2],
	            ["Why is the competitor threat serious?", 3, 2],
	            ["What drives long-term user retention?", 3, 2],
	            ["Why do enterprise deals stall?", 3, 1],
	        ],
	    )
	    c_btn.click(fn=do_context_chain, inputs=[c_query, c_k, c_hops], outputs=c_out)

	# ── Why Retrieved ─────────────────────────────────────────────────────
	with gr.TabItem(label="🔬 Why Retrieved?"):
	    # Describes the score breakdown produced by do_why().
	    gr.Markdown(
	        value=(
	            "Score breakdown for any node — similarity, stickiness (recall bonus), "
	            "recency (adaptive decay), importance, confidence, and the full formula.\n\n"
	            "Use to understand and debug retrieval decisions."
	        )
	    )
	    with gr.Row():
	        # Left column: which node to explain and against which query.
	        with gr.Column(scale=2):
	            w_id = gr.Number(value=4, precision=0, label="Node ID (1–10)")
	            w_query = gr.Textbox(placeholder="onboarding drop-off", label="Query")
	            w_btn = gr.Button(value="Explain", variant="primary")
	        # Right column: JSON result pane.
	        with gr.Column(scale=3):
	            w_out = gr.Code(
	                language="json",
	                label="Score breakdown",
	                elem_classes=["tool-output"],
	            )

	    # Clickable presets; each row is [node id, query].
	    gr.Examples(
	        inputs=[w_id, w_query],
	        examples=[
	            [4, "onboarding drop-off time to value"],
	            [2, "competitor launch editor feature"],
	            [9, "retention power users habit"],
	            [7, "offline mode enterprise security"],
	            [6, "strategy brief Q2 auth friction"],
	        ],
	    )
	    w_btn.click(fn=do_why, inputs=[w_id, w_query], outputs=w_out)

	# ── Health ────────────────────────────────────────────────────────────
	with gr.TabItem(label="🩺 Graph Health"):
	    # Describes the report returned by do_health().
	    gr.Markdown(
	        value=(
	            "Snapshot of the knowledge graph: "
	            "hot / warm / cold tier distribution, "
	            "orphan nodes, expired TTL count, recall histogram, "
	            "avg importance and confidence."
	        )
	    )
	    h_btn = gr.Button(value="Run Health Check", variant="primary")
	    h_out = gr.Code(
	        language="json",
	        label="Health report",
	        elem_classes=["tool-output"],
	    )
	    # The handler takes no inputs, hence the empty input list.
	    h_btn.click(fn=do_health, inputs=[], outputs=h_out)

	# ── Add Intel ─────────────────────────────────────────────────────────
	with gr.TabItem(label="➕ Add Intel"):
	    # Form for do_add(): inserts a new node into the running database.
	    gr.Markdown(
	        value=(
	            "Ingest a new intelligence node into the live graph. "
	            "It becomes immediately searchable — "
	            "try adding something then switching to Search."
	        )
	    )
	    with gr.Row():
	        # Left column: node fields, in the order do_add() receives them.
	        with gr.Column():
	            a_content = gr.Textbox(
	                lines=3,
	                label="Content",
	                placeholder=(
	                    "Competitor Y just open-sourced their SDK. "
	                    "10k stars overnight. Targets our developer acquisition funnel."
	                ),
	            )
	            a_etype = gr.Dropdown(
	                choices=[
	                    "competitor_intel",
	                    "feature_performance",
	                    "user_feedback",
	                    "strategy_brief",
	                    "market_signal",
	                    "community_signal",
	                    "channel_insight",
	                ],
	                value="competitor_intel",
	                label="Entity Type",
	            )
	            a_product = gr.Dropdown(
	                choices=["Editor", "CLI", "SDK", "Cloud"],
	                value="SDK",
	                label="Product",
	            )
	            a_importance = gr.Slider(
	                minimum=0.0,
	                maximum=1.0,
	                value=0.85,
	                step=0.05,
	                label="Importance",
	            )
	            a_btn = gr.Button(value="Add to Graph", variant="primary")
	        # Right column: JSON status returned by do_add().
	        with gr.Column():
	            a_out = gr.Code(
	                language="json",
	                label="Result",
	                elem_classes=["tool-output"],
	            )

	    a_btn.click(
	        fn=do_add,
	        inputs=[a_content, a_etype, a_product, a_importance],
	        outputs=a_out,
	    )

	# Footer shown below the tabs. Two fixes to the rendered text:
	#  * the install command is a shell command, so inside the ```python fence
	#    it is written as a comment and the snippet stays valid Python;
	#  * the Integrations Guide link uses the `master` branch, like every
	#    other repository link in this file.
	gr.Markdown("""
	---
	Connect Feather DB to any LLM in 5 lines:
	```python
	# pip install feather-db

	from feather_db.integrations import ClaudeConnector
	conn = ClaudeConnector(db_path="my.feather", dim=3072, embedder=embed_fn)
	result = conn.run_loop(client,
	messages=[{"role": "user", "content": "Why is onboarding drop-off so high?"}],
	model="claude-opus-4-6")
	```
	Works with Claude · OpenAI · Gemini · Groq · Mistral · Ollama · MCP (Claude Desktop, Cursor)

	[getfeather.store](https://www.getfeather.store/) · [Hawky.ai](https://hawky.ai) · [PyPI](https://pypi.org/project/feather-db/) · [GitHub](https://github.com/feather-store/feather) · [Integrations Guide](https://github.com/feather-store/feather/blob/master/docs/integrations.md)
	""")

	# Start the Gradio server only when this file is executed as a script;
	# importing the module still builds `demo` but does not launch it.
	if __name__ == "__main__":
	demo.launch()