`
```
+## Benchmark 4: Geospatial Bounding Box Query
+
+This benchmark counts samples within the western United States (lat 32-49, lon -125 to -104), comparing lat/lon filtering on wide format versus H3-cell-based filtering on the H3-indexed file.
+
+
+Running Benchmark 4...
+
+
+```{ojs}
+benchmark4 = {
+ if (runBenchmarks < 1 || !benchmark3 || benchmark3.error) return null;
+
+ const loadingDiv = document.getElementById('loading_b4');
+ const errorDiv = document.getElementById('error_display');
+ if (loadingDiv) loadingDiv.hidden = false;
+
+ // Wide: baseline lat/lon bounding box
+ const wideQuery = `
+ SELECT COUNT(*) as cnt FROM wide
+ WHERE otype = 'MaterialSampleRecord'
+ AND latitude BETWEEN 32 AND 49
+ AND longitude BETWEEN -125 AND -104
+ `;
+
+ // Wide+H3: H3-accelerated — first identify cells in the bbox, then filter by cells
+ const h3Query = `
+ WITH cells AS (
+ SELECT DISTINCT h3_res4 FROM wide_h3
+ WHERE latitude BETWEEN 32 AND 49
+ AND longitude BETWEEN -125 AND -104
+ AND otype = 'MaterialSampleRecord'
+ )
+ SELECT COUNT(*) as cnt FROM wide_h3
+ WHERE h3_res4 IN (SELECT h3_res4 FROM cells)
+ AND otype = 'MaterialSampleRecord'
+ `;
+
+ const runs = 3;
+
+ try {
+ const wideTimes = [];
+ for (let i = 0; i < runs; i++) {
+ const start = performance.now();
+ await dbWide.query(wideQuery);
+ wideTimes.push(performance.now() - start);
+ }
+
+ const h3Times = [];
+ for (let i = 0; i < runs; i++) {
+ const start = performance.now();
+ await dbWideH3.query(h3Query);
+ h3Times.push(performance.now() - start);
+ }
+
+ const median = arr => {
+ const sorted = [...arr].sort((a, b) => a - b);
+ if (sorted.length === 2) return (sorted[0] + sorted[1]) / 2;
+ return sorted[Math.floor(sorted.length / 2)];
+ };
+ const warmMedian = arr => {
+ if (arr.length <= 1) return arr[0] || 0;
+ return median(arr.slice(1));
+ };
+
+ const wideMedian = warmMedian(wideTimes);
+ const h3Median = warmMedian(h3Times);
+
+ return {
+ name: "Geospatial BBox (Western US)",
+ wideCold: wideTimes[0],
+ wideMedian: wideMedian,
+ wideAll: wideTimes,
+ h3Cold: h3Times[0],
+ h3Median: h3Median,
+ h3All: h3Times,
+ speedup: wideMedian / h3Median
+ };
+ } catch (e) {
+ if (errorDiv) {
+ errorDiv.textContent = `Benchmark 4 failed: ${e.message}`;
+ errorDiv.style.display = 'block';
+ }
+ return { error: e.message };
+ } finally {
+ if (loadingDiv) loadingDiv.hidden = true;
+ }
+}
+```
+
+```{ojs}
+//| echo: false
+benchmark4 ? (benchmark4.error ? html`
+
+
Benchmark 4 Error
+
Benchmark 4 failed: ${benchmark4.error}
+
+` : html`
+
+
${benchmark4.name}
+
+
+
+
Schema
+
Cold (1st run)
+
Warm (median)
+
All runs
+
+
+
+
Wide (lat/lon)
+
${benchmark4.wideCold.toFixed(0)} ms
+
${benchmark4.wideMedian.toFixed(0)} ms
+
${benchmark4.wideAll.map(t => t.toFixed(0)).join(', ')} ms
+
Wide+H3 (cell filter)
+
${benchmark4.h3Cold.toFixed(0)} ms
+
${benchmark4.h3Median.toFixed(0)} ms
+
${benchmark4.h3All.map(t => t.toFixed(0)).join(', ')} ms
';
+
+ const delta = meta.resolution === 4 ? 2.0 : meta.resolution === 6 ? 0.5 : 0.1;
+ try {
+ const samples = await db.query(`
+ SELECT pid, label, source, latitude, longitude, place_name
+ FROM read_parquet('${lite_url}')
+ WHERE latitude BETWEEN ${meta.lat - delta} AND ${meta.lat + delta}
+ AND longitude BETWEEN ${meta.lng - delta} AND ${meta.lng + delta}
+ LIMIT 30
+ `);
+ updateSamples(samples);
+ } catch(err) {
+ console.error("Sample query failed:", err);
+ if (sampEl) sampEl.innerHTML = '
Query failed — try again.
';
+ }
+ }
+ }, Cesium.ScreenSpaceEventType.LEFT_CLICK);
+
+ return v;
+}
+```
+
+```{ojs}
+//| echo: false
+//| output: false
+
+// === PHASE 1: Load H3 res4 globally (instant) ===
+phase1 = {
+ performance.mark('p1-start');
+
+ const data = await db.query(`
+ SELECT h3_cell, sample_count, center_lat, center_lng,
+ dominant_source, source_count
+ FROM read_parquet('${h3_res4_url}')
+ `);
+
+ const scalar = new Cesium.NearFarScalar(1.5e2, 1.5, 8.0e6, 0.5);
+ let totalSamples = 0;
+
+ for (const row of data) {
+ const count = row.sample_count;
+ totalSamples += count;
+ const size = Math.min(3 + Math.log10(count) * 4, 20);
+ viewer.h3Points.add({
+ id: { count, source: row.dominant_source, lat: row.center_lat, lng: row.center_lng, resolution: 4 },
+ position: Cesium.Cartesian3.fromDegrees(row.center_lng, row.center_lat, 0),
+ pixelSize: size,
+ color: Cesium.Color.fromCssColorString(SOURCE_COLORS[row.dominant_source] || '#666').withAlpha(0.8),
+ scaleByDistance: scalar,
+ });
+ }
+
+ performance.mark('p1-end');
+ performance.measure('p1', 'p1-start', 'p1-end');
+ const elapsed = performance.getEntriesByName('p1').pop().duration;
+
+ updateStats('H3 Res4', data.length, totalSamples, `${(elapsed/1000).toFixed(1)}s`);
+ updatePhaseMsg(`${data.length.toLocaleString()} clusters, ${totalSamples.toLocaleString()} samples. Zoom in for finer detail.`, 'done');
+ console.log(`Phase 1: ${data.length} clusters in ${elapsed.toFixed(0)}ms`);
+
+ return { count: data.length, samples: totalSamples };
+}
+```
+
+```{ojs}
+//| echo: false
+//| output: false
+
+// === Zoom watcher: H3 cluster mode + individual sample point mode ===
+zoomWatcher = {
+ if (!phase1) return;
+
+ // --- State ---
+ let mode = 'cluster'; // 'cluster' or 'point'
+ let currentRes = 4;
+ let loading = false;
+ let requestId = 0; // stale-request guard
+
+ // Hysteresis thresholds to avoid flicker
+ const ENTER_POINT_ALT = 120000; // 120 km → enter point mode
+ const EXIT_POINT_ALT = 180000; // 180 km → exit point mode
+ const POINT_BUDGET = 5000;
+
+ // Viewport cache: avoid re-querying same area
+ let cachedBounds = null; // { south, north, west, east }
+ let cachedData = null; // array of rows
+
+ // --- H3 cluster loading (existing logic) ---
+ const loadRes = async (res, url) => {
+ if (loading) return;
+ loading = true;
+ updatePhaseMsg(`Loading H3 res${res}...`, 'loading');
+
+ try {
+ performance.mark(`r${res}-s`);
+ const data = await db.query(`
+ SELECT h3_cell, sample_count, center_lat, center_lng,
+ dominant_source, source_count
+ FROM read_parquet('${url}')
+ `);
+
+ viewer.h3Points.removeAll();
+ const scalar = new Cesium.NearFarScalar(1.5e2, 1.5, 8.0e6, 0.3);
+ let total = 0;
+
+ for (const row of data) {
+ total += row.sample_count;
+ const size = Math.min(3 + Math.log10(row.sample_count) * 3.5, 18);
+ viewer.h3Points.add({
+ id: { count: row.sample_count, source: row.dominant_source, lat: row.center_lat, lng: row.center_lng, resolution: res },
+ position: Cesium.Cartesian3.fromDegrees(row.center_lng, row.center_lat, 0),
+ pixelSize: size,
+ color: Cesium.Color.fromCssColorString(SOURCE_COLORS[row.dominant_source] || '#666').withAlpha(0.85),
+ scaleByDistance: scalar,
+ });
+ }
+
+ performance.mark(`r${res}-e`);
+ performance.measure(`r${res}`, `r${res}-s`, `r${res}-e`);
+ const elapsed = performance.getEntriesByName(`r${res}`).pop().duration;
+
+ updateStats(`H3 Res${res}`, data.length, total, `${(elapsed/1000).toFixed(1)}s`);
+ updatePhaseMsg(`${data.length.toLocaleString()} clusters, ${total.toLocaleString()} samples. ${res < 8 ? 'Zoom in for finer detail.' : 'Zoom closer for individual samples.'}`, 'done');
+
+ currentRes = res;
+ console.log(`Res${res}: ${data.length} clusters in ${elapsed.toFixed(0)}ms`);
+ } catch(err) {
+ console.error(`Failed to load res${res}:`, err);
+ updatePhaseMsg(`Failed to load H3 res${res} — try zooming again.`, 'loading');
+ } finally {
+ loading = false;
+ }
+ };
+
+ // --- Get camera viewport bounds ---
+ function getViewportBounds() {
+ const rect = viewer.camera.computeViewRectangle(viewer.scene.globe.ellipsoid);
+ if (!rect) return null;
+ return {
+ south: Cesium.Math.toDegrees(rect.south),
+ north: Cesium.Math.toDegrees(rect.north),
+ west: Cesium.Math.toDegrees(rect.west),
+ east: Cesium.Math.toDegrees(rect.east)
+ };
+ }
+
+ // --- Check if viewport is within cached bounds ---
+ function isWithinCache(bounds) {
+ if (!cachedBounds || !bounds) return false;
+ return bounds.south >= cachedBounds.south &&
+ bounds.north <= cachedBounds.north &&
+ bounds.west >= cachedBounds.west &&
+ bounds.east <= cachedBounds.east;
+ }
+
+ // --- Load individual samples for current viewport ---
+ async function loadViewportSamples() {
+ const myReqId = ++requestId;
+ const bounds = getViewportBounds();
+ if (!bounds) return;
+
+ // If viewport is within cached area, just re-render from cache
+ if (isWithinCache(bounds) && cachedData) {
+ renderSamplePoints(cachedData, bounds);
+ return;
+ }
+
+ // Fetch with 30% padding for smooth panning
+ const latPad = (bounds.north - bounds.south) * 0.3;
+ const lngPad = (bounds.east - bounds.west) * 0.3;
+ const padded = {
+ south: bounds.south - latPad,
+ north: bounds.north + latPad,
+ west: bounds.west - lngPad,
+ east: bounds.east + lngPad
+ };
+
+ updatePhaseMsg('Loading individual samples...', 'loading');
+
+ try {
+ performance.mark('sp-s');
+ const data = await db.query(`
+ SELECT pid, label, source, latitude, longitude,
+ place_name, result_time
+ FROM read_parquet('${lite_url}')
+ WHERE latitude BETWEEN ${padded.south} AND ${padded.north}
+ AND longitude BETWEEN ${padded.west} AND ${padded.east}
+ LIMIT ${POINT_BUDGET}
+ `);
+ performance.mark('sp-e');
+ performance.measure('sp', 'sp-s', 'sp-e');
+ const elapsed = performance.getEntriesByName('sp').pop().duration;
+
+ // Stale guard: discard if a newer request was issued
+ if (myReqId !== requestId) {
+ console.log(`Discarding stale sample response (req ${myReqId}, current ${requestId})`);
+ return;
+ }
+
+ // Cache the padded bounds + data
+ cachedBounds = padded;
+ cachedData = Array.from(data);
+
+ renderSamplePoints(cachedData, bounds);
+
+ updateStats('Samples', cachedData.length, cachedData.length, `${(elapsed/1000).toFixed(1)}s`);
+ updatePhaseMsg(`${cachedData.length.toLocaleString()} individual samples. Click one for details.`, 'done');
+ console.log(`Point mode: ${cachedData.length} samples in ${elapsed.toFixed(0)}ms`);
+
+ } catch(err) {
+ if (myReqId !== requestId) return;
+ console.error("Viewport sample query failed:", err);
+ updatePhaseMsg('Sample query failed — try again.', 'loading');
+ }
+ }
+
+ // --- Render sample points on globe ---
+ function renderSamplePoints(data, bounds) {
+ viewer.samplePoints.removeAll();
+ const scalar = new Cesium.NearFarScalar(1e2, 8, 2e5, 3);
+
+ for (const row of data) {
+ const color = SOURCE_COLORS[row.source] || '#666';
+ viewer.samplePoints.add({
+ id: {
+ type: 'sample',
+ pid: row.pid,
+ label: row.label,
+ source: row.source,
+ lat: row.latitude,
+ lng: row.longitude,
+ place_name: row.place_name,
+ result_time: row.result_time
+ },
+ position: Cesium.Cartesian3.fromDegrees(row.longitude, row.latitude, 0),
+ pixelSize: 6,
+ color: Cesium.Color.fromCssColorString(color).withAlpha(0.9),
+ scaleByDistance: scalar,
+ });
+ }
+ }
+
+ // --- Mode transitions ---
+ function enterPointMode(pushHistory) {
+ mode = 'point';
+ viewer._globeState.mode = 'point';
+ viewer.h3Points.show = false;
+ viewer.samplePoints.show = true;
+ if (pushHistory !== false) history.pushState(null, '', buildHash(viewer));
+ loadViewportSamples();
+ console.log('Entered point mode');
+ }
+
+ function exitPointMode(pushHistory) {
+ mode = 'cluster';
+ viewer._globeState.mode = 'cluster';
+ viewer.samplePoints.show = false;
+ viewer.samplePoints.removeAll();
+ viewer.h3Points.show = true;
+ if (pushHistory !== false) history.pushState(null, '', buildHash(viewer));
+ cachedBounds = null;
+ cachedData = null;
+
+ // Restore cluster stats
+ let clusterCount = viewer.h3Points.length;
+ updateStats(`H3 Res${currentRes}`, clusterCount, '—', '—');
+ updatePhaseMsg(`${clusterCount.toLocaleString()} clusters. Zoom closer for individual samples.`, 'done');
+ console.log('Exited point mode');
+ }
+
+ // --- Camera change handler ---
+ let timer = null;
+ viewer.camera.changed.addEventListener(() => {
+ if (timer) clearTimeout(timer);
+ timer = setTimeout(async () => {
+ const h = viewer.camera.positionCartographic.height;
+
+ // Determine target mode with hysteresis
+ const targetMode = h < ENTER_POINT_ALT ? 'point'
+ : h > EXIT_POINT_ALT ? 'cluster'
+ : mode;
+
+ if (targetMode === 'point' && mode !== 'point') {
+ // Make sure we're at res8 clusters before transitioning
+ if (currentRes !== 8 && !loading) {
+ await loadRes(8, h3_res8_url);
+ }
+ enterPointMode();
+ } else if (targetMode === 'cluster' && mode !== 'cluster') {
+ exitPointMode();
+ // Reload appropriate resolution
+ const target = h > 3000000 ? 4 : h > 300000 ? 6 : 8;
+ if (target !== currentRes && !loading) {
+ await loadRes(target, { 4: h3_res4_url, 6: h3_res6_url, 8: h3_res8_url }[target]);
+ }
+ } else if (targetMode === 'point') {
+ // Already in point mode — update viewport samples
+ loadViewportSamples();
+ } else {
+ // Cluster mode — check if resolution should change
+ const target = h > 3000000 ? 4 : h > 300000 ? 6 : 8;
+ if (target !== currentRes && !loading) {
+ await loadRes(target, { 4: h3_res4_url, 6: h3_res6_url, 8: h3_res8_url }[target]);
+ }
+ }
+
+ // Update URL hash (replaceState for continuous movement)
+ if (!viewer._suppressHashWrite) {
+ history.replaceState(null, '', buildHash(viewer));
+ }
+ }, 600);
+ });
+ viewer.camera.percentageChanged = 0.1;
+
+ // --- Handle browser back/forward ---
+ window.addEventListener('hashchange', async () => {
+ // Re-apply the view encoded in the URL hash (browser back/forward or a
+ // manual edit). Hashes without both lat and lng are ignored.
+ const state = readHash();
+ if (state.lat == null || state.lng == null) return;
+
+ // Block the camera handler's replaceState while the programmatic flight is
+ // running, and drop any pending "un-suppress" timer from an earlier event.
+ viewer._suppressHashWrite = true;
+ clearTimeout(viewer._suppressTimer);
+ viewer.camera.cancelFlight();
+ // NOTE(review): alt has a fallback but heading/pitch do not; confirm that
+ // readHash always supplies them, otherwise toRadians receives undefined.
+ viewer.camera.flyTo({
+ destination: Cesium.Cartesian3.fromDegrees(state.lng, state.lat, state.alt || 20000000),
+ orientation: {
+ heading: Cesium.Math.toRadians(state.heading),
+ pitch: Cesium.Math.toRadians(state.pitch)
+ },
+ duration: 1.5,
+ });
+
+ // After flight settles, force mode and clear suppress flag
+ // (the 2000 ms delay is longer than the 1.5 s flight duration above; the
+ // hash is re-read so the mode reflects the hash at that moment).
+ viewer._suppressTimer = setTimeout(() => {
+ viewer._suppressHashWrite = false;
+ const s = readHash();
+ if (s.mode === 'point' && mode !== 'point') enterPointMode(false);
+ else if (s.mode !== 'point' && mode === 'point') exitPointMode(false);
+ }, 2000);
+
+ // Handle pid selection
+ if (state.pid) {
+ viewer._globeState.selectedPid = state.pid;
+ try {
+ // Single quotes in the pid are doubled before it is spliced into the SQL literal.
+ const sample = await db.query(`
+ SELECT pid, label, source, latitude, longitude, place_name, result_time
+ FROM read_parquet('${lite_url}')
+ WHERE pid = '${state.pid.replace(/'/g, "''")}'
+ LIMIT 1
+ `);
+ if (sample && sample.length > 0) {
+ const s = sample[0];
+ updateSampleCard({
+ pid: s.pid, label: s.label, source: s.source,
+ lat: s.latitude, lng: s.longitude,
+ place_name: s.place_name, result_time: s.result_time
+ });
+ }
+ } catch(err) {
+ // Lookup failures are logged only; the card is left unchanged.
+ console.error("Hash pid query failed:", err);
+ }
+ } else {
+ // No pid in the hash: clear the selection and the cluster card.
+ viewer._globeState.selectedPid = null;
+ updateClusterCard(null);
+ }
+ });
+
+ // --- Share button ---
+ const shareBtn = document.getElementById('shareBtn');
+ if (shareBtn) {
+ shareBtn.addEventListener('click', async () => {
+ history.replaceState(null, '', buildHash(viewer));
+ try {
+ await navigator.clipboard.writeText(location.href);
+ const toast = document.getElementById('shareToast');
+ if (toast) {
+ toast.style.opacity = '1';
+ setTimeout(() => { toast.style.opacity = '0'; }, 2000);
+ }
+ } catch(err) {
+ prompt('Copy this link:', location.href);
+ }
+ });
+ }
+
+ // --- Deep-link: restore selection from initial hash ---
+ const ih = viewer._initialHash;
+ if (ih.pid) {
+ viewer._globeState.selectedPid = ih.pid;
+ try {
+ const sample = await db.query(`
+ SELECT pid, label, source, latitude, longitude, place_name, result_time
+ FROM read_parquet('${lite_url}')
+ WHERE pid = '${ih.pid.replace(/'/g, "''")}'
+ LIMIT 1
+ `);
+ if (sample && sample.length > 0) {
+ const s = sample[0];
+ updateSampleCard({
+ pid: s.pid, label: s.label, source: s.source,
+ lat: s.latitude, lng: s.longitude,
+ place_name: s.place_name, result_time: s.result_time
+ });
+ const detail = await db.query(`
+ SELECT description FROM read_parquet('${wide_url}')
+ WHERE pid = '${ih.pid.replace(/'/g, "''")}'
+ LIMIT 1
+ `);
+ if (detail && detail.length > 0) updateSampleDetail(detail[0]);
+ else updateSampleDetail({ description: '' });
+ }
+ } catch(err) {
+ console.error("Deep-link pid query failed:", err);
+ }
+ }
+
+ // Enable hash writing now that everything is initialized
+ viewer._suppressHashWrite = false;
+
+ return "active";
+}
+```
+
+## How This Demo Works
+
+Pre-aggregated H3 hexagonal indices achieve near-instant globe rendering, with seamless drill-down to individual samples:
+
+| Phase | Data | Size | Points |
+|-------|------|------|--------|
+| **Instant** | H3 res4 | 580 KB | 38K clusters (continental) |
+| **Zoom in** | H3 res6 | 1.6 MB | 112K clusters (city) |
+| **Zoom more** | H3 res8 | 2.5 MB | 176K clusters (neighborhood) |
+| **Zoom deep** | Map lite | 60 MB (range req.) | Up to 5K individual samples |
+| **Click sample** | Full dataset | ~280 MB (range req.) | Full metadata for 1 sample |
+
+**5 parquet files, zero backend.** All queries run in your browser via DuckDB-WASM with HTTP range requests — only the bytes you need are transferred.
+
+## See Also
+
+- [Cesium Globe (All Points)](/tutorials/parquet_cesium_isamples_wide.html) — Full point-level rendering
+- [Interactive Explorer](/tutorials/isamples_explorer.html) — Search and filter with facets
+- [Deep-Dive Analysis](/tutorials/zenodo_isamples_analysis.html) — DuckDB-WASM SQL tutorial
diff --git a/tutorials/zenodo_isamples_analysis.qmd b/tutorials/zenodo_isamples_analysis.qmd
index 48793f9..97b0e56 100644
--- a/tutorials/zenodo_isamples_analysis.qmd
+++ b/tutorials/zenodo_isamples_analysis.qmd
@@ -27,12 +27,17 @@ This tutorial demonstrates how to efficiently analyze large geospatial datasets
## Dataset Information
-**Primary dataset** (Jan 2026):
-- **URL**: `https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet`
-- **Size**: ~280 MB wide format, 6.7M MaterialSampleRecords (20M total rows)
+**Primary dataset** (Jan 2026, H3-indexed):
+- **URL**: `https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet`
+- **Size**: ~292 MB wide format with H3 indices, 6.7M MaterialSampleRecords (20M total rows)
+- **H3 columns**: Pre-computed `h3_res4`, `h3_res6`, `h3_res8` (BIGINT) for spatial grouping
- **Sources**: SESAR (4.6M), OpenContext (1M), GEOME (605K), Smithsonian (322K)
- **Hosting**: Cloudflare R2 with HTTP range request support
+**Facet summaries** (2KB, instant):
+- **URL**: `https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_facet_summaries.parquet`
+- **Schema**: `facet_type`, `facet_value`, `scheme`, `count`
+
**Note**: *Data was originally archived on Zenodo and is now served from Cloudflare R2 for better performance and reliability.*
**Fallback dataset** (if remote data fails):
@@ -81,8 +86,11 @@ d3 = require("d3@7")
topojson = require("topojson-client@3")
// Dataset URLs - try multiple options for CORS compatibility
-// Primary: Cloudflare R2 (Jan 2026 wide format)
+// Primary: Cloudflare R2 (Jan 2026 wide format with H3 indices)
parquet_urls = [
+ 'https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet',
+
+ // Fallback: original wide format without H3
'https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet',
// Fallback: older versions
@@ -90,6 +98,9 @@ parquet_urls = [
'https://zenodo.org/api/records/15278211/files/isamples_export_2025_04_21_16_23_46_geo.parquet/content'
]
+// Pre-computed facet summaries (2KB - loads instantly)
+facet_summaries_url = 'https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_facet_summaries.parquet'
+
// Test CORS and find working URL - with rate limiting protection
working_parquet_url = {
// Check if we've recently failed (to avoid repeated rate limiting)
@@ -419,36 +430,89 @@ geo_stats = {
};
}
-// Regional analysis using bounding boxes
+// Data-driven regional analysis using H3 res4 cell grouping
+// Replaces hardcoded CASE WHEN bounding boxes with dynamic discovery
regional_data = {
- const result = await db.query(`
- SELECT
- CASE
- WHEN sample_location_longitude BETWEEN -125 AND -66
- AND sample_location_latitude BETWEEN 24 AND 50 THEN 'North America'
- WHEN sample_location_longitude BETWEEN -11 AND 40
- AND sample_location_latitude BETWEEN 35 AND 71 THEN 'Europe'
- WHEN sample_location_longitude BETWEEN 95 AND 141
- AND sample_location_latitude BETWEEN 18 AND 54 THEN 'East Asia'
- WHEN sample_location_longitude BETWEEN 113 AND 154
- AND sample_location_latitude BETWEEN -44 AND -10 THEN 'Australia'
- ELSE 'Other'
- END as region,
- source_collection,
- count(*) as sample_count,
- avg(sample_location_latitude) as avg_lat,
- avg(sample_location_longitude) as avg_lon
- FROM isamples_data
- WHERE sample_location_latitude IS NOT NULL
- AND sample_location_longitude IS NOT NULL
- GROUP BY 1, 2
- ORDER BY region, sample_count DESC
- `);
- // Convert BigInt values to Numbers
- return result.toArray().map(row => ({
- ...row,
- sample_count: Number(row.sample_count)
- }));
+ // Check if h3_res4 column exists (H3-indexed file)
+ let hasH3 = false;
+ try {
+ const colCheck = await db.query(`SELECT h3_res4 FROM isamples_data LIMIT 1`);
+ hasH3 = true;
+ } catch (e) {
+ hasH3 = false;
+ }
+
+ if (hasH3) {
+ // H3-based regional grouping: discover dense clusters dynamically
+ const result = await db.query(`
+ SELECT
+ h3_res4,
+ COUNT(*) as sample_count,
+ AVG(sample_location_latitude) as avg_lat,
+ AVG(sample_location_longitude) as avg_lon,
+ COUNT(DISTINCT source_collection) as source_count,
+ MODE(source_collection) as dominant_source
+ FROM isamples_data
+ WHERE sample_location_latitude IS NOT NULL
+ AND sample_location_longitude IS NOT NULL
+ AND h3_res4 IS NOT NULL
+ GROUP BY h3_res4
+ HAVING COUNT(*) > 100
+ ORDER BY sample_count DESC
+ `);
+ // Assign region labels based on centroid location
+ return result.toArray().map(row => {
+ const lat = row.avg_lat;
+ const lon = row.avg_lon;
+ let region;
+ if (lon >= -130 && lon <= -60 && lat >= 20 && lat <= 55) region = 'North America';
+ else if (lon >= -15 && lon <= 45 && lat >= 30 && lat <= 75) region = 'Europe';
+ else if (lon >= 90 && lon <= 150 && lat >= 15 && lat <= 55) region = 'East Asia';
+ else if (lon >= 110 && lon <= 160 && lat >= -50 && lat <= -5) region = 'Australia';
+ else if (lon >= -90 && lon <= -30 && lat >= -60 && lat <= 15) region = 'South America';
+ else if (lon >= -20 && lon <= 55 && lat >= -40 && lat <= 30) region = 'Africa';
+ else region = 'Other';
+ return {
+ region,
+ source_collection: row.dominant_source,
+ sample_count: Number(row.sample_count),
+ avg_lat: row.avg_lat,
+ avg_lon: row.avg_lon,
+ h3_cell: row.h3_res4,
+ source_count: Number(row.source_count)
+ };
+ });
+ } else {
+ // Fallback for non-H3 files: use simple lat/lon-based grouping
+ const result = await db.query(`
+ SELECT
+ source_collection,
+ count(*) as sample_count,
+ avg(sample_location_latitude) as avg_lat,
+ avg(sample_location_longitude) as avg_lon
+ FROM isamples_data
+ WHERE sample_location_latitude IS NOT NULL
+ AND sample_location_longitude IS NOT NULL
+ GROUP BY source_collection
+ ORDER BY sample_count DESC
+ `);
+ return result.toArray().map(row => {
+ const lat = row.avg_lat;
+ const lon = row.avg_lon;
+ let region = 'Other';
+ if (lon >= -130 && lon <= -60 && lat >= 20 && lat <= 55) region = 'North America';
+ else if (lon >= -15 && lon <= 45 && lat >= 30 && lat <= 75) region = 'Europe';
+ else if (lon >= 90 && lon <= 150 && lat >= 15 && lat <= 55) region = 'East Asia';
+ else if (lon >= 110 && lon <= 160 && lat >= -50 && lat <= -5) region = 'Australia';
+ return {
+ region,
+ source_collection: row.source_collection,
+ sample_count: Number(row.sample_count),
+ avg_lat: row.avg_lat,
+ avg_lon: row.avg_lon
+ };
+ });
+ }
}
```
@@ -461,8 +525,10 @@ md`
- **Latitude range**: ${geo_stats.min_lat.toFixed(3)}° to ${geo_stats.max_lat.toFixed(3)}°
- **Longitude range**: ${geo_stats.min_lon.toFixed(3)}° to ${geo_stats.max_lon.toFixed(3)}°
- **Average location**: ${geo_stats.avg_lat.toFixed(3)}°, ${geo_stats.avg_lon.toFixed(3)}°
-- **Total regional records**: ${regional_data.length}
-- **Regions found**: ${[...new Set(regional_data.map(d => d.region))].join(', ')}
+- **Dense H3 clusters**: ${regional_data.length} (cells with >100 samples)
+- **Regions discovered**: ${[...new Set(regional_data.map(d => d.region))].join(', ')}
+
+*Samples are grouped by H3 resolution-4 hexagonal cells discovered from the data; each dense cell is then assigned a region name from the location of its centroid.*
`
```
@@ -486,27 +552,26 @@ viewof selected_region = Inputs.select(
```{ojs}
//| label: regional-chart
-// Regional distribution chart
+// Regional distribution chart (data-driven from H3 clusters)
regional_chart = {
- // Validate that regional_data is an array
if (!Array.isArray(regional_data)) {
return html`
Error: Regional data is not available
`;
}
-
- // Aggregate the regional data by region like we do for source data
+
+ // Aggregate H3 cell data by discovered region
const regionTotals = d3.rollup(
- regional_data,
- v => d3.sum(v, d => d.sample_count),
+ regional_data,
+ v => d3.sum(v, d => d.sample_count),
d => d.region
);
-
+
const aggregatedData = Array.from(regionTotals, ([region, total]) => ({
region: region,
sample_count: total
})).sort((a, b) => b.sample_count - a.sample_count);
-
+
return Plot.plot({
- title: `Sample Distribution by Region (${aggregatedData.length} regions)`,
+ title: `Sample Distribution by Region (H3-derived, ${aggregatedData.length} regions)`,
width: 700,
height: 300,
marginLeft: 120,
@@ -527,7 +592,7 @@ regional_chart = {
}),
Plot.text(aggregatedData, {
x: "sample_count",
- y: "region",
+ y: "region",
text: d => d3.format("~s")(d.sample_count),
dx: 10,
textAnchor: "start"
@@ -714,42 +779,76 @@ Explore the distribution of material categories across different sources.
```{ojs}
//| label: material-analysis
-// Get top material categories by source
+// Material data: use pre-computed facet summaries for instant results
+// Falls back to full-scan if summaries unavailable
material_data = {
- const result = await db.query(`
- SELECT
- source_collection,
- has_material_category,
- count(*) as category_count
- FROM isamples_data
- WHERE has_material_category IS NOT NULL
- GROUP BY source_collection, has_material_category
- ORDER BY source_collection, category_count DESC
- `);
- // Convert BigInt values to Numbers
- return result.toArray().map(row => ({
- ...row,
- category_count: Number(row.category_count)
- }));
+ try {
+ // Try pre-computed summaries first (2KB, instant)
+ const result = await db.query(`
+ SELECT
+ facet_value as has_material_category,
+ 'ALL' as source_collection,
+ count as category_count
+ FROM read_parquet('${facet_summaries_url}')
+ WHERE facet_type = 'material'
+ ORDER BY count DESC
+ `);
+ return result.toArray().map(row => ({
+ ...row,
+ category_count: Number(row.category_count)
+ }));
+ } catch (e) {
+ console.warn("Facet summaries unavailable, falling back to full scan:", e.message);
+ const result = await db.query(`
+ SELECT
+ source_collection,
+ has_material_category,
+ count(*) as category_count
+ FROM isamples_data
+ WHERE has_material_category IS NOT NULL
+ GROUP BY source_collection, has_material_category
+ ORDER BY source_collection, category_count DESC
+ `);
+ return result.toArray().map(row => ({
+ ...row,
+ category_count: Number(row.category_count)
+ }));
+ }
}
-// Get top 10 categories overall
+// Top categories from pre-computed summaries (instant)
top_categories = {
- const result = await db.query(`
- SELECT
- has_material_category,
- count(*) as total_count
- FROM isamples_data
- WHERE has_material_category IS NOT NULL
- GROUP BY has_material_category
- ORDER BY total_count DESC
- LIMIT 10
- `);
- // Convert BigInt values to Numbers
- return result.toArray().map(row => ({
- ...row,
- total_count: Number(row.total_count)
- }));
+ try {
+ const result = await db.query(`
+ SELECT
+ facet_value as has_material_category,
+ count as total_count
+ FROM read_parquet('${facet_summaries_url}')
+ WHERE facet_type = 'material'
+ ORDER BY count DESC
+ LIMIT 10
+ `);
+ return result.toArray().map(row => ({
+ ...row,
+ total_count: Number(row.total_count)
+ }));
+ } catch (e) {
+ console.warn("Facet summaries unavailable, falling back to full scan:", e.message);
+ const result = await db.query(`
+ SELECT
+ has_material_category,
+ count(*) as total_count
+ FROM isamples_data
+ WHERE has_material_category IS NOT NULL
+ GROUP BY has_material_category
+ ORDER BY total_count DESC
+ LIMIT 10
+ `);
+ return result.toArray().map(row => ({
+ ...row,
+ total_count: Number(row.total_count)
+ }));
+ }
}
```