Getting Started
Tip: All examples below are self-contained. Copy any block into a Python file or the usql shell and run it directly.
Python
from uqa.engine import Engine

# An engine with no arguments lives purely in memory;
# pass db_path=... to persist to disk instead.
engine = Engine()

# Define the schema and load a handful of rows.
engine.sql("""
CREATE TABLE books (
id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
author TEXT NOT NULL,
year INTEGER,
rating REAL
)
""")
engine.sql("""INSERT INTO books (title, author, year, rating) VALUES
('The Pragmatic Programmer', 'Hunt & Thomas', 1999, 4.6),
('Clean Code', 'Robert Martin', 2008, 4.4),
('Designing Data-Intensive Applications', 'Martin Kleppmann', 2017, 4.8),
('Structure and Interpretation of Computer Programs', 'Abelson & Sussman', 1984, 4.5),
('Introduction to Algorithms', 'Cormen et al.', 2009, 4.3)
""")

# Run a filtered, ordered query and iterate the result rows.
rows = engine.sql("""
SELECT title, author, rating
FROM books
WHERE year >= 2000
ORDER BY rating DESC
""")
for book in rows:
    print(f"{book['title']}: {book['rating']}")

engine.close()
Full-Text Search
SQL
BM25 Search
-- Standard BM25 full-text search
-- NOTE(review): _score appears to be a pseudo-column populated by the
-- search predicate in WHERE -- confirm against the engine docs.
SELECT title, _score FROM papers
WHERE text_match(title, 'attention mechanism transformer')
ORDER BY _score DESC;
-- Bayesian BM25: calibrated P(relevant) in [0,1]
SELECT title, _score FROM papers
WHERE bayesian_match(title, 'deep learning optimization')
ORDER BY _score DESC;
-- Combine text search with relational filters
-- (ordinary predicates and search predicates mix freely in one WHERE)
SELECT title, year, _score FROM papers
WHERE text_match(title, 'neural network')
AND year >= 2020
AND citations > 100
ORDER BY _score DESC LIMIT 10;
Python
Fluent API Text Search
# Build the query step by step rather than as one chained expression.
query = engine.query(table="papers").term("attention", field="title")
query = query.score_bayesian_bm25("attention")
result = query.execute()

for entry in result.entries:
    print(f"doc_id={entry.doc_id}, score={entry.payload.score:.4f}")
Vector Search
SQL
KNN with HNSW Index
-- Create table with vector column
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
title TEXT,
embedding VECTOR(4)
);
-- Insert documents with embeddings
INSERT INTO documents (title, embedding) VALUES
('machine learning basics', ARRAY[0.9, 0.1, 0.0, 0.2]),
('deep neural networks', ARRAY[0.8, 0.3, 0.1, 0.1]),
('natural language processing', ARRAY[0.2, 0.8, 0.7, 0.1]),
('computer vision models', ARRAY[0.7, 0.2, 0.3, 0.5]),
('reinforcement learning', ARRAY[0.6, 0.1, 0.2, 0.8]);
-- Brute-force exact KNN (no index)
-- (the third argument of knn_match is k, the number of neighbours)
SELECT title, _score FROM documents
WHERE knn_match(embedding, ARRAY[0.85, 0.15, 0.05, 0.15], 3)
ORDER BY _score DESC;
-- Create HNSW index for faster approximate search
-- (ef_construction and m are the standard HNSW build parameters)
CREATE INDEX idx_doc_emb ON documents USING hnsw (embedding)
WITH (ef_construction = 128, m = 16);
-- Same query now uses HNSW (transparent to the user)
SELECT title, _score FROM documents
WHERE knn_match(embedding, ARRAY[0.85, 0.15, 0.05, 0.15], 3)
ORDER BY _score DESC;
Python
Fluent API Vector Search
query_vec = [0.85, 0.15, 0.05, 0.15]

# k-nearest-neighbour search: the 3 closest documents.
result = (
    engine.query(table="documents").knn(query_vec, k=3, field="embedding").execute()
)

# Threshold search: every document whose similarity is at least 0.7.
result = (
    engine.query(table="documents")
    .vector(query_vec, threshold=0.7, field="embedding")
    .execute()
)
Hybrid Fusion
Combine text, vector, and graph signals into a single relevance score.
SQL
-- Log-odds fusion: text + vector
-- (fuse_log_odds combines the wrapped signal scores into one _score)
SELECT title, _score FROM papers
WHERE fuse_log_odds(
text_match(title, 'attention transformer'),
knn_match(embedding, ARRAY[0.1, 0.2, 0.3, 0.4], 10)
)
ORDER BY _score DESC LIMIT 5;
-- Three-signal fusion: text + vector + graph
-- (ARRAY[0.1, 0.2, ...] is elided here -- supply the full query vector)
SELECT title, _score FROM papers
WHERE fuse_log_odds(
text_match(title, 'attention'),
knn_match(embedding, ARRAY[0.1, 0.2, ...], 5),
traverse_match(1, 'cited_by', 2)
) AND year >= 2020
ORDER BY _score DESC;
-- Probabilistic boolean: find papers matching text OR similar vectors
SELECT title, _score FROM papers
WHERE fuse_prob_or(
bayesian_match(title, 'neural network'),
knn_match(embedding, ARRAY[...], 10)
)
ORDER BY _score DESC;
-- Exclude certain topics with probabilistic NOT
SELECT title, _score FROM papers
WHERE fuse_prob_and(
text_match(title, 'machine learning'),
fuse_prob_not(text_match(title, 'supervised'))
)
ORDER BY _score DESC;
Python
# Fluent API: fuse the text signal and the vector signal into one score.
fused = engine.query(table="papers")
fused = fused.term("attention", field="title")
fused = fused.score_bayesian_bm25("attention")
fused = fused.knn(query_vector, k=10, field="embedding")
result = fused.fuse_log_odds().execute()

for entry in result.top_k(5):
    print(f"doc_id={entry.doc_id}, score={entry.payload.score:.4f}")
JOINs & Subqueries
SQL
-- Setup
CREATE TABLE departments (id SERIAL PRIMARY KEY, name TEXT, budget INTEGER);
CREATE TABLE employees (
id SERIAL PRIMARY KEY, name TEXT,
dept_id INTEGER REFERENCES departments(id), salary INTEGER
);
INSERT INTO departments (name, budget) VALUES
('Engineering', 500000), ('Marketing', 200000), ('Research', 300000);
INSERT INTO employees (name, dept_id, salary) VALUES
('Alice', 1, 120000), ('Bob', 1, 110000),
('Carol', 2, 95000), ('Dave', 3, 130000),
('Eve', 3, 125000);
-- JOIN with aggregation
SELECT d.name AS department,
COUNT(*) AS headcount,
AVG(e.salary) AS avg_salary
FROM employees e
INNER JOIN departments d ON e.dept_id = d.id
GROUP BY d.name
ORDER BY avg_salary DESC;
-- Employees earning above their department average
-- (the derived table dept_avg is computed once, then joined back)
SELECT e.name, e.salary, dept_avg.avg_sal
FROM employees e
JOIN (SELECT dept_id, AVG(salary) AS avg_sal
FROM employees GROUP BY dept_id) dept_avg
ON e.dept_id = dept_avg.dept_id
WHERE e.salary > dept_avg.avg_sal;
-- Top earner per department (LATERAL subquery)
-- (LATERAL lets the subquery reference d from the outer FROM list)
SELECT d.name, top.name AS top_earner, top.salary
FROM departments d,
LATERAL (SELECT name, salary FROM employees e
WHERE e.dept_id = d.id
ORDER BY salary DESC LIMIT 1) top;
-- Recursive org chart
CREATE TABLE org (id SERIAL, name TEXT, manager_id INTEGER);
INSERT INTO org (name, manager_id) VALUES
('CEO', NULL), ('VP Eng', 1), ('VP Sales', 1),
('Lead Dev', 2), ('Sr Dev', 4);
-- anchor: the row with no manager; recursive step: its direct reports
WITH RECURSIVE tree AS (
SELECT id, name, 0 AS depth FROM org WHERE manager_id IS NULL
UNION ALL
SELECT o.id, o.name, t.depth + 1
FROM org o JOIN tree t ON o.manager_id = t.id
)
SELECT REPEAT(' ', depth) || name AS org_chart FROM tree;
Analytics
SQL
Window Functions
CREATE TABLE sales (
id SERIAL, rep TEXT, region TEXT,
amount INTEGER, sale_date DATE
);
INSERT INTO sales (rep, region, amount, sale_date) VALUES
('Alice', 'East', 500, '2024-01-15'),
('Alice', 'East', 300, '2024-02-10'),
('Bob', 'West', 700, '2024-01-20'),
('Bob', 'West', 400, '2024-03-05'),
('Carol', 'East', 600, '2024-02-28');
-- Running total per rep
-- (explicit ROWS frame: everything from the partition start to this row)
SELECT rep, sale_date, amount,
SUM(amount) OVER (PARTITION BY rep ORDER BY sale_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM sales;
-- Rank within region
-- (GROUP BY runs first; RANK() then ranks the per-group SUMs)
SELECT rep, region, SUM(amount) AS total,
RANK() OVER (PARTITION BY region ORDER BY SUM(amount) DESC) AS rnk
FROM sales
GROUP BY rep, region;
Aggregates with FILTER
-- Conditional aggregation
-- (FILTER restricts which rows feed each individual aggregate)
SELECT region,
SUM(amount) AS total,
SUM(amount) FILTER (WHERE amount >= 500) AS big_deals,
COUNT(*) FILTER (WHERE amount < 500) AS small_deal_count
FROM sales GROUP BY region;
-- Monthly summary with date functions
-- (DATE_TRUNC collapses each sale_date to the first of its month)
SELECT DATE_TRUNC('month', sale_date) AS month,
COUNT(*) AS num_sales,
SUM(amount) AS revenue,
ROUND(AVG(amount), 2) AS avg_deal
FROM sales
GROUP BY DATE_TRUNC('month', sale_date)
ORDER BY month;
JSON Analytics
CREATE TABLE events (id SERIAL, data JSONB);
INSERT INTO events (data) VALUES
('{"type": "click", "page": "/home", "duration": 5}'::jsonb),
('{"type": "click", "page": "/about", "duration": 3}'::jsonb),
('{"type": "view", "page": "/home", "duration": 12}'::jsonb);
-- ->> extracts a JSON field as text, hence the CAST before averaging
SELECT data->>'type' AS event_type,
data->>'page' AS page,
AVG(CAST(data->>'duration' AS INTEGER)) AS avg_duration
FROM events
GROUP BY data->>'type', data->>'page';
Graph Queries
SQL
Social Network
-- Create named graph
SELECT * FROM create_graph('social');
-- Build the graph
-- (each cypher() call declares its result columns in the trailing AS (...) clause)
SELECT * FROM cypher('social', $$
CREATE (a:Person {name: 'Alice', age: 30, city: 'Seoul'})
CREATE (b:Person {name: 'Bob', age: 25, city: 'Tokyo'})
CREATE (c:Person {name: 'Carol', age: 35, city: 'Seoul'})
CREATE (d:Person {name: 'Dave', age: 28, city: 'London'})
CREATE (a)-[:KNOWS {since: 2020}]->(b)
CREATE (a)-[:KNOWS {since: 2019}]->(c)
CREATE (b)-[:KNOWS {since: 2021}]->(d)
CREATE (c)-[:KNOWS {since: 2022}]->(d)
RETURN a.name
$$) AS (name agtype);
-- Friends of friends
SELECT * FROM cypher('social', $$
MATCH (a:Person {name: 'Alice'})-[:KNOWS]->(friend)-[:KNOWS]->(fof)
WHERE fof.name <> 'Alice'
RETURN DISTINCT fof.name, fof.city
$$) AS (name agtype, city agtype);
-- Variable-length path: reachable within 3 hops
SELECT * FROM cypher('social', $$
MATCH (a:Person {name: 'Alice'})-[:KNOWS*1..3]->(reachable)
RETURN DISTINCT reachable.name
$$) AS (name agtype);
-- MERGE: create if not exists, update if exists
SELECT * FROM cypher('social', $$
MERGE (p:Person {name: 'Eve'})
ON CREATE SET p.age = 26
ON MATCH SET p.last_seen = 'today'
RETURN p.name, p.age
$$) AS (name agtype, age agtype);
-- Aggregation in Cypher
-- (WITH ... WHERE filters on the aggregate, like SQL's HAVING)
SELECT * FROM cypher('social', $$
MATCH (p:Person)
WITH p.city AS city, COUNT(*) AS cnt
WHERE cnt > 1
RETURN city, cnt
ORDER BY cnt DESC
$$) AS (city agtype, cnt agtype);
SQL
SQL Graph Functions
-- BFS traversal from vertex 1, up to 2 hops via 'cited_by'
SELECT _doc_id, title FROM traverse(1, 'cited_by', 2);
-- Regular path query: 2-hop cited_by chain
-- NOTE(review): the second rpq argument looks like the start vertex id -- confirm
SELECT _doc_id, title FROM rpq('cited_by/cited_by', 1);
-- Graph reachability as a scored signal (for fusion)
SELECT title, _score FROM papers
WHERE traverse_match(1, 'cited_by', 3)
ORDER BY _score DESC;
SQL
Graph Centrality & Weighted Paths
-- PageRank as FROM table source
SELECT _doc_id, title, _score FROM pagerank() ORDER BY _score DESC;
-- HITS and betweenness centrality
SELECT _doc_id, title, _score FROM hits() ORDER BY _score DESC;
SELECT _doc_id, title, _score FROM betweenness() ORDER BY _score DESC;
-- Centrality on named graph
SELECT _doc_id, _score FROM pagerank('graph:citations');
-- Centrality as WHERE signal + relational filter
SELECT title, _score FROM papers
WHERE pagerank() AND year >= 2020 ORDER BY _score DESC;
-- Bounded RPQ: 2-3 hops
SELECT _doc_id, title FROM rpq('cites{2,3}', 7);
-- Weighted path query with threshold
SELECT title, _score FROM papers
WHERE weighted_rpq('cites/cites', 7, 'weight', 'sum', 6.0);
-- Graph mutation via SQL
SELECT * FROM graph_add_vertex(1, 'person', 'employees');
SELECT * FROM graph_add_edge(1, 1, 2, 'knows', 'employees');
-- Fusion: text + centrality + gating
-- NOTE(review): the trailing 'relu' appears to select the gating
-- function -- see the fusion_gating example for the full option set.
SELECT title, _score FROM papers
WHERE fuse_log_odds(
text_match(title, 'attention'),
pagerank(), 'relu'
) ORDER BY _score DESC;
-- Progressive fusion: cascading WAND
SELECT title, _score FROM papers
WHERE progressive_fusion(
text_match(title, 'attention'),
traverse_match(1, 'cites', 2), 5,
pagerank(), 3
) ORDER BY _score DESC;
Foreign Data Wrappers & Parquet
SQL
DuckDB FDW with Hive Partitioning
-- Register DuckDB server
CREATE SERVER dw FOREIGN DATA WRAPPER duckdb_fdw;
-- Map a Parquet directory with Hive partitioning
-- (with Hive partitioning, year/month come from the directory names)
CREATE FOREIGN TABLE orders (
order_id INTEGER, customer TEXT, amount REAL,
year INTEGER, month INTEGER
) SERVER dw OPTIONS (
source '/data/orders/**/*.parquet',
hive_partitioning 'true'
);
-- Query with predicate pushdown (partition pruning)
SELECT customer, SUM(amount) AS total
FROM orders
WHERE year = 2025 AND month BETWEEN 1 AND 3
GROUP BY customer
ORDER BY total DESC LIMIT 10;
-- Join foreign table with local table
SELECT o.customer, e.name AS rep, SUM(o.amount)
FROM orders o
JOIN employees e ON o.customer = e.name
GROUP BY o.customer, e.name;
CSV Files
-- CSV file mapped through the same FDW server (dw, registered above)
CREATE FOREIGN TABLE logs (
ts TEXT, level TEXT, msg TEXT
) SERVER dw OPTIONS (source '/var/log/app.csv');
SELECT level, COUNT(*) FROM logs GROUP BY level;
Text Analysis Pipeline
Python
from uqa.analysis import (
Analyzer, StandardTokenizer, LetterTokenizer,
NGramTokenizer, KeywordTokenizer,
LowerCaseFilter, StopWordFilter, PorterStemFilter,
ASCIIFoldingFilter, SynonymFilter, EdgeNGramFilter,
HTMLStripCharFilter, MappingCharFilter,
)
# --- Standard English analysis: tokenize, lowercase, drop stop words, stem ---
english_filters = [
    LowerCaseFilter(),
    StopWordFilter(language="english"),
    PorterStemFilter(),
]
english = Analyzer(tokenizer=StandardTokenizer(), token_filters=english_filters)
print(english.analyze("The quick brown foxes are jumping"))
# ['quick', 'brown', 'fox', 'jump']

# --- Autocomplete: emit every 2..8-character prefix of each token ---
autocomplete = Analyzer(
    tokenizer=StandardTokenizer(),
    token_filters=[LowerCaseFilter(), EdgeNGramFilter(min_gram=2, max_gram=8)],
)
print(autocomplete.analyze("machine learning"))
# ['ma', 'mac', 'mach', ..., 'le', 'lea', 'lear', ...]

# --- Synonym expansion: map abbreviations onto their long forms ---
synonym_map = {"ml": ["machine learning"], "ai": ["artificial intelligence"]}
synonyms = Analyzer(
    tokenizer=StandardTokenizer(),
    token_filters=[LowerCaseFilter(), SynonymFilter(synonyms=synonym_map)],
)
print(synonyms.analyze("ml models"))
# ['ml', 'machine learning', 'models']

# --- HTML input: strip the markup before tokenizing ---
html_analyzer = Analyzer(
    char_filters=[HTMLStripCharFilter()],
    tokenizer=StandardTokenizer(),
    token_filters=[LowerCaseFilter()],
)
print(html_analyzer.analyze("<p>Hello <b>World</b></p>"))
# ['hello', 'world']
SQL
-- Create analyzer via SQL
-- (JSON spec mirrors the Python Analyzer: char_filters run before the
--  tokenizer, token_filters after it)
SELECT * FROM create_analyzer('my_english', '{
"tokenizer": {"type": "standard"},
"token_filters": [
{"type": "lowercase"},
{"type": "stop", "language": "english"},
{"type": "porter_stem"}
],
"char_filters": [{"type": "html_strip"}]
}');
-- List all analyzers
SELECT * FROM list_analyzers();
-- Drop a custom analyzer
SELECT * FROM drop_analyzer('my_english');
Arrow / Parquet Export
Python
from uqa.engine import Engine
# Populate a small demo table.
engine = Engine()
engine.sql("CREATE TABLE t (id SERIAL, name TEXT, score REAL)")
engine.sql("INSERT INTO t (name, score) VALUES ('a', 1.5), ('b', 2.3), ('c', 0.8)")

# A SQL result converts to an Arrow table (zero-copy from the engine).
result = engine.sql("SELECT * FROM t ORDER BY score DESC")
arrow_table = result.to_arrow()
print(arrow_table.schema)
print(arrow_table.to_pandas())

# ...or straight to a Parquet file on disk.
result.to_parquet("output.parquet")

# The fluent builder offers the same two sinks.
arrow_table = engine.query(table="t").execute_arrow()
engine.query(table="t").execute_parquet("fluent_output.parquet")

engine.close()
Fluent QueryBuilder API
Python
Boolean Composition
from uqa.engine import Engine
from uqa.core.types import Equals, GreaterThanOrEqual, Between

engine = Engine()
# ... (create and populate table)

# Boolean composition: two single-term queries, combined as AND / OR.
neural_q = engine.query(table="papers").term("neural", field="title")
network_q = engine.query(table="papers").term("network", field="title")
both = neural_q.and_(network_q).execute()    # intersection of the two result sets
either = neural_q.or_(network_q).execute()   # union of the two result sets

# Relational predicates attach with .filter().
result = (
    engine.query(table="papers")
    .filter("year", GreaterThanOrEqual(2020))
    .filter("citations", Between(100, 10000))
    .execute()
)

# Facets: number of documents per distinct value of a field.
facets = engine.query(table="papers").facet("category")
for value, count in facets.items():
    print(f"{value}: {count}")
EXPLAIN
# .explain() returns the query plan instead of executing the query.
explain_q = engine.query(table="papers").term("attention", field="title")
explain_q = explain_q.score_bm25("attention")
explain_q = explain_q.knn(query_vector, k=5, field="embedding")
plan = explain_q.fuse_log_odds().explain()
print(plan)
Hierarchical Data
Python
from uqa.core.types import Equals
# Insert a document containing a nested object and a list of objects.
order_doc = {
    "customer": "Alice",
    "shipping": {"city": "Seoul", "country": "KR"},
    "items": [
        {"name": "Widget", "price": 25.00, "qty": 3},
        {"name": "Gadget", "price": 49.99, "qty": 1},
    ],
}
engine.add_document(table="orders", document=order_doc)

# Dotted-path filter into the nested object.
result = (
    engine.query(table="orders")
    .path_filter("shipping.city", Equals("Seoul"))
    .execute()
)

# Dotted-path aggregate across every element of the items list.
result = (
    engine.query(table="orders")
    .path_aggregate("items.price", "sum")
    .execute()
)
SQL
-- SQL equivalents of the fluent path_filter / path_aggregate calls above
SELECT * FROM orders WHERE path_filter('shipping.city', 'Seoul');
SELECT *, path_agg('items.price', 'sum') AS total FROM orders;
SELECT path_value('shipping.country') AS country FROM orders;
Source Files
Complete runnable examples are in the repository:
Quickstart
examples/quickstart.py
Hybrid search (text + vector + fusion) in under 30 lines
SQL Examples
examples/sql/
basics, functions, graph, graph_centrality, fusion, fusion_gating, joins_and_subqueries, analytics, analysis, export, fdw
Fluent API Examples
examples/fluent/
text_search, vector_and_hybrid, graph, graph_centrality, hierarchical, multi_paradigm, analysis, export
# Quickstart
python examples/quickstart.py
# Run all examples
# (the directory listings above also mention graph_centrality and
#  fusion_gating examples, which are not repeated in this list)
python examples/sql/basics.py
python examples/sql/functions.py
python examples/sql/graph.py
python examples/sql/fusion.py
python examples/sql/joins_and_subqueries.py
python examples/sql/analytics.py
python examples/sql/analysis.py
python examples/sql/export.py
python examples/sql/fdw.py
python examples/fluent/text_search.py
python examples/fluent/vector_and_hybrid.py
python examples/fluent/graph.py
python examples/fluent/hierarchical.py
python examples/fluent/multi_paradigm.py
python examples/fluent/analysis.py
python examples/fluent/export.py