diff --git a/_quarto.yml b/_quarto.yml index 3fe9ffa..1ac7b40 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -54,6 +54,8 @@ website: href: tutorials/zenodo_isamples_analysis.qmd - text: "3D Globe Visualization" href: tutorials/parquet_cesium_isamples_wide.qmd + - text: "Progressive Globe (H3 + Samples)" + href: tutorials/progressive_globe.qmd - text: "Technical: Narrow vs Wide" href: tutorials/narrow_vs_wide_performance.qmd diff --git a/tutorials/isamples_explorer.qmd b/tutorials/isamples_explorer.qmd index 7b85ab8..5939d9d 100644 --- a/tutorials/isamples_explorer.qmd +++ b/tutorials/isamples_explorer.qmd @@ -12,7 +12,7 @@ Search and explore **6.7 million physical samples** from scientific collections ::: {.callout-note} ### Serverless Architecture -This app queries a ~280 MB Parquet file directly in your browser using DuckDB-WASM. No server required! +This app uses a **two-tier loading strategy**: a 2KB pre-computed summary loads instantly for facet counts (source, material, context, specimen type), while the full ~280 MB Parquet file is only queried when drilling into records. All powered by DuckDB-WASM in your browser -- no server required! 
::: ## Setup @@ -28,6 +28,9 @@ duckdbModule = import("https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.28.0/+ // Data source configuration parquet_url = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +// Pre-computed facet summaries (2KB - loads instantly) +facet_summaries_url = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_facet_summaries.parquet" + // Source color scheme (consistent with iSamples conventions) SOURCE_COLORS = ({ 'SESAR': '#3366CC', // Blue @@ -79,14 +82,18 @@ viewof searchInput = Inputs.text({ ### Filters +```{ojs} +facetSummariesWarning +``` + **Source** ```{ojs} //| code-fold: true -// Source checkboxes with counts +// Source checkboxes with counts - uses pre-computed summaries for instant load viewof sourceCheckboxes = { - // Get source counts based on current search - const counts = await sourceCounts; + // Use pre-computed facet summaries (instant) instead of scanning full parquet + const counts = facetsByType.source; const options = counts.map(r => r.value); return Inputs.checkbox(options, { @@ -104,6 +111,69 @@ viewof sourceCheckboxes = { } ``` +**Material** + +```{ojs} +//| code-fold: true +// Material filter - loaded from pre-computed summaries +viewof materialCheckboxes = { + const counts = facetsByType.material; + const options = counts.map(r => r.value); + return Inputs.checkbox(options, { + value: [], + format: (x) => { + const r = counts.find(s => s.value === x); + const count = r ? Number(r.count).toLocaleString() : "0"; + return html` + ${x} (${count}) + `; + } + }); +} +``` + +**Sampled Feature** + +```{ojs} +//| code-fold: true +// Context filter - loaded from pre-computed summaries +viewof contextCheckboxes = { + const counts = facetsByType.context; + const options = counts.map(r => r.value); + return Inputs.checkbox(options, { + value: [], + format: (x) => { + const r = counts.find(s => s.value === x); + const count = r ? 
Number(r.count).toLocaleString() : "0"; + return html` + ${x} (${count}) + `; + } + }); +} +``` + +**Specimen Type** + +```{ojs} +//| code-fold: true +// Object type filter - loaded from pre-computed summaries +viewof objectTypeCheckboxes = { + const counts = facetsByType.object_type; + const options = counts.map(r => r.value); + return Inputs.checkbox(options, { + value: [], + format: (x) => { + const r = counts.find(s => s.value === x); + const count = r ? Number(r.count).toLocaleString() : "0"; + return html` + ${x} (${count}) + `; + } + }); +} +``` + ```{ojs} //| code-fold: true html`Clear All Filters` @@ -131,6 +201,9 @@ viewof maxSamples = Inputs.range([1000, 100000], { const params = new URLSearchParams(); if (searchInput) params.set("q", searchInput); if (sourceCheckboxes?.length) params.set("sources", sourceCheckboxes.join(",")); + if (materialCheckboxes?.length) params.set("material", materialCheckboxes.join(",")); + if (contextCheckboxes?.length) params.set("context", contextCheckboxes.join(",")); + if (objectTypeCheckboxes?.length) params.set("object_type", objectTypeCheckboxes.join(",")); if (viewMode !== "globe") params.set("view", viewMode); const newUrl = params.toString() ? `?${params.toString()}` : window.location.pathname; @@ -264,7 +337,50 @@ async function runQuery(sql) { ```{ojs} //| code-fold: true -// Build WHERE clause from current filters +// Tier 1: Load pre-computed facet summaries (2KB, instant); on failure returns [] and records the error in the mutable facetSummariesError cell +facetSummaries = { + mutable facetSummariesError = null; // clear any error left over from a previous load + try { + const rows = await runQuery(`SELECT * FROM read_parquet('${facet_summaries_url}')`); + return rows; + } catch (e) { + console.error("Facet summaries load error:", e); + mutable facetSummariesError = e; // surfaces the failure to facetSummariesWarning + return []; + } +} +mutable facetSummariesError = null // must be a declared mutable cell: OJS rejects plain assignment to an undeclared/external name +``` + +```{ojs} +//| code-fold: true +facetSummariesWarning = { + if (!facetSummariesError) return null; + return html`
+ Facet summaries failed to load. Filter counts may be missing. Try refreshing. +
`; +} + +// Extract facet counts by type from pre-computed summaries +facetsByType = { + const grouped = { source: [], material: [], context: [], object_type: [] }; + for (const row of facetSummaries) { + const ft = row.facet_type; + if (grouped[ft]) { + grouped[ft].push({ value: row.facet_value, count: Number(row.count), scheme: row.scheme }); + } + } + // Sort each by count descending + for (const key of Object.keys(grouped)) { + grouped[key].sort((a, b) => b.count - a.count); + } + return grouped; +} +``` + +```{ojs} +//| code-fold: true +// Build WHERE clause from current filters (Tier 2: queries full parquet only when filtering) whereClause = { const conditions = [ "otype = 'MaterialSampleRecord'", @@ -288,40 +404,36 @@ whereClause = { conditions.push(`n IN (${sourceList})`); } + // Material filter + const materials = Array.from(materialCheckboxes || []); + if (materials.length > 0) { + const matList = materials.map(m => `'${m.replace(/'/g, "''")}'`).join(", "); + conditions.push(`has_material_category IN (${matList})`); + } + + // Context (sampled feature) filter + const contexts = Array.from(contextCheckboxes || []); + if (contexts.length > 0) { + const ctxList = contexts.map(c => `'${c.replace(/'/g, "''")}'`).join(", "); + conditions.push(`has_context_category IN (${ctxList})`); + } + + // Object type (specimen type) filter + const objectTypes = Array.from(objectTypeCheckboxes || []); + if (objectTypes.length > 0) { + const otList = objectTypes.map(o => `'${o.replace(/'/g, "''")}'`).join(", "); + conditions.push(`has_specimen_category IN (${otList})`); + } + return conditions.join(" AND "); } ``` ```{ojs} //| code-fold: true -// Get source facet counts (respects text search but not source filter) -sourceCounts = { - let baseWhere = "otype = 'MaterialSampleRecord' AND latitude IS NOT NULL"; - - if (searchInput?.trim()) { - const term = searchInput.trim().replace(/'/g, "''"); - baseWhere += ` AND ( - label ILIKE '%${term}%' - OR description ILIKE '%${term}%' 
- OR CAST(place_name AS VARCHAR) ILIKE '%${term}%' - )`; - } - - const query = ` - SELECT n as value, COUNT(*) as count - FROM samples - WHERE ${baseWhere} - GROUP BY n - ORDER BY count DESC - `; - - try { - return await runQuery(query); - } catch (e) { - console.error("Facet query error:", e); - return []; - } -} +// Source counts now come from pre-computed facet summaries (Tier 1) +// No longer scans the full parquet file on every page load +sourceCounts = facetsByType.source ``` ```{ojs} diff --git a/tutorials/narrow_vs_wide_performance.qmd b/tutorials/narrow_vs_wide_performance.qmd index bfa1621..da5fa03 100644 --- a/tutorials/narrow_vs_wide_performance.qmd +++ b/tutorials/narrow_vs_wide_performance.qmd @@ -1,6 +1,6 @@ --- -title: "Narrow vs Wide Schema Performance Comparison" -categories: [parquet, performance, benchmarking] +title: "Narrow vs Wide vs H3 Schema Performance Comparison" +categories: [parquet, performance, benchmarking, h3] format: html: code-fold: true @@ -8,20 +8,23 @@ format: toc-depth: 3 --- -This page benchmarks the performance difference between **narrow** and **wide** parquet schema formats when accessing data "over the wire" via HTTP range requests in DuckDB-WASM. +This page benchmarks the performance difference between **narrow**, **wide**, and **wide + H3** parquet schema formats when accessing data "over the wire" via HTTP range requests in DuckDB-WASM. ## Introduction ### What are Narrow vs Wide Schemas? 
-The iSamples property graph data can be serialized in two different parquet formats: +The iSamples property graph data can be serialized in three different parquet formats: | Format | Description | File Size | Row Count | Sources | |--------|-------------|-----------|-----------|---------| | **Narrow** | Stores relationships as separate edge rows (`otype='_edge_'`) | ~850 MB | ~106M rows | All 4 sources | -| **Wide** | Stores relationships as `p__*` columns on entity rows | ~280 MB | ~20M rows | All 4 sources | +| **Wide** | Stores relationships as `p__*` columns on entity rows | ~278 MB | ~20M rows | All 4 sources | +| **Wide + H3** | Wide format + pre-computed H3 spatial indices (`h3_res4/6/8`) | ~292 MB | ~20M rows | All 4 sources | -Both formats represent the **same underlying data** (SESAR, OpenContext, GEOME, Smithsonian) with identical semantics, but the wide format is optimized for analytical queries by eliminating edge rows. +All three formats represent the **same underlying data** (SESAR, OpenContext, GEOME, Smithsonian) with identical semantics. The wide format eliminates edge rows; the H3 variant adds pre-computed hexagonal spatial indices for accelerated geospatial queries. + +A **facet summaries** file (2KB) provides pre-aggregated counts for instant facet lookups. 
**Data source**: Cloudflare R2 (updated January 2026) @@ -63,6 +66,8 @@ import { DuckDBClient } from "https://cdn.jsdelivr.net/npm/@observablehq/duckdb@ // Updated 2026-01-14: Using Zenodo narrow/wide files on Cloudflare R2 narrowUrl = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202512_narrow.parquet" wideUrl = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet" +wideH3Url = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet" +summariesUrl = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_facet_summaries.parquet" ``` ### Environment Info @@ -110,7 +115,7 @@ viewof runBenchmarks = Inputs.button("Run All Benchmarks", { ```{ojs} //| echo: false // Initialize databases only when button is clicked (lazy loading) -// Returns { narrow, wide } or null if not yet clicked +// Returns { narrow, wide, wideH3 } or null if not yet clicked initDatabases = { if (runBenchmarks < 1) return null; @@ -127,8 +132,11 @@ initDatabases = { const wideDb = await DuckDBClient.of(); await wideDb.query(`CREATE VIEW wide AS SELECT * FROM read_parquet('${wideUrl}')`); + const wideH3Db = await DuckDBClient.of(); + await wideH3Db.query(`CREATE VIEW wide_h3 AS SELECT * FROM read_parquet('${wideH3Url}')`); + if (loadingDiv) loadingDiv.style.display = 'none'; - return { narrow: narrowDb, wide: wideDb }; + return { narrow: narrowDb, wide: wideDb, wideH3: wideH3Db }; } catch (e) { const errorMsg = `Failed to initialize databases: ${e.message}. This may be due to network issues or CORS restrictions.`; if (errorDiv) { @@ -142,6 +150,7 @@ initDatabases = { dbNarrow = initDatabases && !initDatabases.error ? initDatabases.narrow : null dbWide = initDatabases && !initDatabases.error ? initDatabases.wide : null +dbWideH3 = initDatabases && !initDatabases.error ? initDatabases.wideH3 : null ``` ## Data Validity Check @@ -155,7 +164,7 @@ Checking data validity... 
```{ojs} validityCheck = { // Only run when button clicked AND databases are initialized - if (runBenchmarks < 1 || !dbNarrow || !dbWide) return null; + if (runBenchmarks < 1 || !dbNarrow || !dbWide || !dbWideH3) return null; const loadingDiv = document.getElementById('loading_validity'); const errorDiv = document.getElementById('error_display'); @@ -170,6 +179,10 @@ validityCheck = { const wideCount = await dbWide.query(`SELECT COUNT(*) as cnt FROM wide`); const wideTotal = wideCount[0].cnt; + // Count rows in wide+H3 + const wideH3Count = await dbWideH3.query(`SELECT COUNT(*) as cnt FROM wide_h3`); + const wideH3Total = wideH3Count[0].cnt; + // Count entity types in narrow (excluding edges) const narrowEntities = await dbNarrow.query(` SELECT COUNT(*) as cnt FROM narrow @@ -177,7 +190,7 @@ validityCheck = { `); const narrowEntityCount = narrowEntities[0].cnt; - // Count samples in both + // Count samples in all three const narrowSamples = await dbNarrow.query(` SELECT COUNT(*) as cnt FROM narrow WHERE otype = 'MaterialSampleRecord' @@ -188,13 +201,21 @@ validityCheck = { WHERE otype = 'MaterialSampleRecord' `); + const wideH3Samples = await dbWideH3.query(` + SELECT COUNT(*) as cnt FROM wide_h3 + WHERE otype = 'MaterialSampleRecord' + `); + return { narrowTotal: narrowTotal, wideTotal: wideTotal, + wideH3Total: wideH3Total, narrowEntities: narrowEntityCount, narrowSamples: narrowSamples[0].cnt, wideSamples: wideSamples[0].cnt, + wideH3Samples: wideH3Samples[0].cnt, sampleMatch: narrowSamples[0].cnt === wideSamples[0].cnt + && wideSamples[0].cnt === wideH3Samples[0].cnt }; } catch (e) { if (errorDiv) { @@ -221,12 +242,14 @@ validityCheck ? (validityCheck.error ? html` + - + +
Narrow total rows:${validityCheck.narrowTotal.toLocaleString()}
Wide total rows:${validityCheck.wideTotal.toLocaleString()}
Wide+H3 total rows:${validityCheck.wideH3Total.toLocaleString()}
Narrow entities (non-edge):${validityCheck.narrowEntities.toLocaleString()}
Narrow samples:${validityCheck.narrowSamples.toLocaleString()}
Wide samples:${validityCheck.wideSamples.toLocaleString()}
Sample count match:${validityCheck.sampleMatch ? '✅ Yes' : '❌ No'}
Wide+H3 samples:${validityCheck.wideH3Samples.toLocaleString()}
Sample count match (all 3):${validityCheck.sampleMatch ? '✅ Yes' : '❌ No'}
-

The wide schema has ~79% fewer rows because edge rows are eliminated and stored as columns.

+

The wide schema has ~79% fewer rows because edge rows are eliminated. The H3 variant adds 3 spatial index columns (h3_res4/6/8).

`) : html`

Click "Run All Benchmarks" to check data validity

` ``` @@ -635,22 +658,265 @@ benchmark3 ? (benchmark3.error ? html` `) : html`

Waiting for benchmark...

` ``` +## Benchmark 4: Geospatial Bounding Box Query + +This benchmark counts samples within the western United States (lat 32-49, lon -125 to -104), comparing lat/lon filtering on wide format versus H3-cell-based filtering on the H3-indexed file. + + + +```{ojs} +benchmark4 = { + if (runBenchmarks < 1 || !benchmark3 || benchmark3.error) return null; + + const loadingDiv = document.getElementById('loading_b4'); + const errorDiv = document.getElementById('error_display'); + if (loadingDiv) loadingDiv.hidden = false; + + // Wide: baseline lat/lon bounding box + const wideQuery = ` + SELECT COUNT(*) as cnt FROM wide + WHERE otype = 'MaterialSampleRecord' + AND latitude BETWEEN 32 AND 49 + AND longitude BETWEEN -125 AND -104 + `; + + // Wide+H3: H3-accelerated — first identify cells in the bbox, then filter by cells + const h3Query = ` + WITH cells AS ( + SELECT DISTINCT h3_res4 FROM wide_h3 + WHERE latitude BETWEEN 32 AND 49 + AND longitude BETWEEN -125 AND -104 + AND otype = 'MaterialSampleRecord' + ) + SELECT COUNT(*) as cnt FROM wide_h3 + WHERE h3_res4 IN (SELECT h3_res4 FROM cells) + AND otype = 'MaterialSampleRecord' + `; + + const runs = 3; + + try { + const wideTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbWide.query(wideQuery); + wideTimes.push(performance.now() - start); + } + + const h3Times = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbWideH3.query(h3Query); + h3Times.push(performance.now() - start); + } + + const median = arr => { + const sorted = [...arr].sort((a, b) => a - b); + if (sorted.length === 2) return (sorted[0] + sorted[1]) / 2; + return sorted[Math.floor(sorted.length / 2)]; + }; + const warmMedian = arr => { + if (arr.length <= 1) return arr[0] || 0; + return median(arr.slice(1)); + }; + + const wideMedian = warmMedian(wideTimes); + const h3Median = warmMedian(h3Times); + + return { + name: "Geospatial BBox (Western US)", + wideCold: wideTimes[0], + 
wideMedian: wideMedian, + wideAll: wideTimes, + h3Cold: h3Times[0], + h3Median: h3Median, + h3All: h3Times, + speedup: wideMedian / h3Median + }; + } catch (e) { + if (errorDiv) { + errorDiv.textContent = `Benchmark 4 failed: ${e.message}`; + errorDiv.style.display = 'block'; + } + return { error: e.message }; + } finally { + if (loadingDiv) loadingDiv.hidden = true; + } +} +``` + +```{ojs} +//| echo: false +benchmark4 ? (benchmark4.error ? html` +
+

Benchmark 4 Error

+

Benchmark 4 failed: ${benchmark4.error}

+
+` : html` +
+

${benchmark4.name}

+ + + + + + + + + + + + + + + + + + + +
SchemaCold (1st run)Warm (median)All runs
Wide (lat/lon)${benchmark4.wideCold.toFixed(0)} ms${benchmark4.wideMedian.toFixed(0)} ms${benchmark4.wideAll.map(t => t.toFixed(0)).join(', ')} ms
Wide+H3 (cell filter)${benchmark4.h3Cold.toFixed(0)} ms${benchmark4.h3Median.toFixed(0)} ms${benchmark4.h3All.map(t => t.toFixed(0)).join(', ')} ms
+

Speedup: ${benchmark4.speedup.toFixed(2)}x (H3 is ${benchmark4.speedup > 1 ? 'faster' : 'slower'})

+
+`) : html`

Waiting for benchmark...

` +``` + +## Benchmark 5: Facet Aggregation — Full Scan vs Pre-computed Summary + +This benchmark compares full-scan source aggregation on the wide file versus a pre-computed 2KB facet summary file. + + + +```{ojs} +benchmark5 = { + if (runBenchmarks < 1 || !benchmark4 || benchmark4.error) return null; + + const loadingDiv = document.getElementById('loading_b5'); + const errorDiv = document.getElementById('error_display'); + if (loadingDiv) loadingDiv.hidden = false; + + // Full scan: aggregate source counts from full wide file + const fullScanQuery = ` + SELECT n, COUNT(*) as cnt FROM wide + WHERE otype = 'MaterialSampleRecord' + GROUP BY n + `; + + // Pre-computed: read from 2KB summary file + const summaryQuery = ` + SELECT facet_value, count FROM read_parquet('${summariesUrl}') + WHERE facet_type = 'source' + `; + + const runs = 3; + + try { + const fullScanTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbWide.query(fullScanQuery); + fullScanTimes.push(performance.now() - start); + } + + const summaryTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbWide.query(summaryQuery); + summaryTimes.push(performance.now() - start); + } + + const median = arr => { + const sorted = [...arr].sort((a, b) => a - b); + if (sorted.length === 2) return (sorted[0] + sorted[1]) / 2; + return sorted[Math.floor(sorted.length / 2)]; + }; + const warmMedian = arr => { + if (arr.length <= 1) return arr[0] || 0; + return median(arr.slice(1)); + }; + + const fullScanMedian = warmMedian(fullScanTimes); + const summaryMedian = warmMedian(summaryTimes); + + return { + name: "Facet Aggregation (full scan vs summary)", + fullScanCold: fullScanTimes[0], + fullScanMedian: fullScanMedian, + fullScanAll: fullScanTimes, + summaryCold: summaryTimes[0], + summaryMedian: summaryMedian, + summaryAll: summaryTimes, + speedup: fullScanMedian / summaryMedian + }; + } catch (e) { + if (errorDiv) { + errorDiv.textContent = 
`Benchmark 5 failed: ${e.message}`; + errorDiv.style.display = 'block'; + } + return { error: e.message }; + } finally { + if (loadingDiv) loadingDiv.hidden = true; + } +} +``` + +```{ojs} +//| echo: false +benchmark5 ? (benchmark5.error ? html` +
+

Benchmark 5 Error

+

Benchmark 5 failed: ${benchmark5.error}

+
+` : html` +
+

${benchmark5.name}

+ + + + + + + + + + + + + + + + + + + +
ApproachCold (1st run)Warm (median)All runs
Full scan (280MB)${benchmark5.fullScanCold.toFixed(0)} ms${benchmark5.fullScanMedian.toFixed(0)} ms${benchmark5.fullScanAll.map(t => t.toFixed(0)).join(', ')} ms
Pre-computed (2KB)${benchmark5.summaryCold.toFixed(0)} ms${benchmark5.summaryMedian.toFixed(0)} ms${benchmark5.summaryAll.map(t => t.toFixed(0)).join(', ')} ms
+

Speedup: ${benchmark5.speedup.toFixed(2)}x (pre-computed is ${benchmark5.speedup > 1 ? 'faster' : 'slower'})

+
+`) : html`

Waiting for benchmark...

` +``` + ## Results Summary ```{ojs} //| echo: false allResults = { - if (!benchmark1 || !benchmark2 || !benchmark3) return null; + if (!benchmark1 || !benchmark2 || !benchmark3 || !benchmark4 || !benchmark5) return null; - const results = [benchmark1, benchmark2, benchmark3]; - const successful = results.filter(r => r && !r.error); - const avgSpeedup = successful.length - ? successful.reduce((sum, r) => sum + r.speedup, 0) / successful.length + // Schema comparison benchmarks (narrow vs wide) + const schemaResults = [benchmark1, benchmark2, benchmark3]; + const schemaSuccessful = schemaResults.filter(r => r && !r.error); + const avgSchemaSpeedup = schemaSuccessful.length + ? schemaSuccessful.reduce((sum, r) => sum + r.speedup, 0) / schemaSuccessful.length : null; return { - benchmarks: results, - avgSpeedup: avgSpeedup + schemaBenchmarks: schemaResults, + geoBenchmark: benchmark4, + facetBenchmark: benchmark5, + avgSchemaSpeedup: avgSchemaSpeedup }; } @@ -658,6 +924,7 @@ allResults ? html`

Summary Results

+

Schema Comparison (Narrow vs Wide)

@@ -668,7 +935,7 @@ allResults ? html` -${allResults.benchmarks.map(b => { +${allResults.schemaBenchmarks.map(b => { const hasError = !b || b.error; return html` @@ -683,26 +950,76 @@ ${allResults.benchmarks.map(b => { - + + + +
Average - -${allResults.avgSpeedup != null ? `${allResults.avgSpeedup.toFixed(2)}x` : 'N/A'}${allResults.avgSchemaSpeedup != null ? `${allResults.avgSchemaSpeedup.toFixed(2)}x` : 'N/A'}
+ +

Geospatial Query (Wide vs Wide+H3)

+ + + + + + + + + +${allResults.geoBenchmark && !allResults.geoBenchmark.error ? html` + + + + + + + + + +` : html``} + +
ApproachWarm (ms)Speedup
Wide (lat/lon bbox)${allResults.geoBenchmark.wideMedian.toFixed(0)}baseline
Wide+H3 (cell filter)${allResults.geoBenchmark.h3Median.toFixed(0)}${allResults.geoBenchmark.speedup.toFixed(2)}x
Error: ${allResults.geoBenchmark?.error || 'N/A'}
+ +

Facet Aggregation (Full Scan vs Pre-computed Summary)

+ + + + + + + + + +${allResults.facetBenchmark && !allResults.facetBenchmark.error ? html` + + + + + + + + + + +` : html``}
ApproachWarm (ms)Speedup
Full scan (280MB wide)${allResults.facetBenchmark.fullScanMedian.toFixed(0)}baseline
Pre-computed summary (2KB)${allResults.facetBenchmark.summaryMedian.toFixed(0)}${allResults.facetBenchmark.speedup.toFixed(2)}x
Error: ${allResults.facetBenchmark?.error || 'N/A'}

Key Findings

Recommendation

-

For browser-based analysis with DuckDB-WASM, the wide format is recommended for: +

For browser-based analysis with DuckDB-WASM:

-

` : html`
@@ -736,7 +1053,18 @@ ${allResults.benchmarks.map(b => { {otype: 'MaterialSampleRecord', p__produced_by: [456], p__has_material_category: [789]} ``` -This eliminates ~9M edge rows, resulting in the 60% file size reduction. +**Wide+H3 schema** adds pre-computed spatial indices: +```sql +-- Same as wide, plus H3 hexagonal index columns +{..., h3_res4: 595536348953485311, h3_res6: 604265133842685951, h3_res8: 613003918731886591} +``` + +The wide format eliminates ~9M edge rows (60% file size reduction). The H3 variant adds ~14MB for spatial index columns that enable cell-based geospatial filtering. + +**Facet summaries** (2KB) pre-compute common aggregations: +```sql +{facet_type: 'source', facet_value: 'SESAR', scheme: null, count: 4600000} +``` ## See Also diff --git a/tutorials/parquet_cesium_isamples_wide.qmd b/tutorials/parquet_cesium_isamples_wide.qmd index e123540..e13de7e 100644 --- a/tutorials/parquet_cesium_isamples_wide.qmd +++ b/tutorials/parquet_cesium_isamples_wide.qmd @@ -3,18 +3,20 @@ title: Using Cesium for display of remote parquet (iSamples Wide Format). categories: [parquet, spatial, recipe, wide, isamples] --- -This page renders points from the **full iSamples wide-format** parquet file (all sources: SESAR, OpenContext, GEOME, Smithsonian) on Cesium using point primitives. +This page renders points from the **full iSamples wide-format** parquet file (all sources: SESAR, OpenContext, GEOME, Smithsonian) on Cesium using point primitives, with **zoom-adaptive H3 clustering** for fast initial load. 
::: {.callout-note} -## iSamples Full Dataset (Wide Format) +## iSamples Full Dataset (Wide Format + H3 Spatial Index) This page uses the **iSamples combined dataset** (Jan 2026) which includes: - **6.7M MaterialSampleRecords** from all iSamples sources - **Source breakdown**: SESAR (4.6M), OpenContext (1M), GEOME (605K), Smithsonian (322K) -- **~280 MB** wide format (vs ~850 MB narrow) - 67% smaller -- **20M total rows** (all entity types, no edge rows) -- **47 columns** with flattened latitude/longitude (direct column access, no JSON parsing) +- **~292 MB** wide format with H3 indices (vs ~850 MB narrow) - 66% smaller +- **H3 spatial index**: Pre-computed `h3_res4`, `h3_res6`, `h3_res8` columns for zoom-adaptive clustering +- **Clustered view**: At high altitude shows res4 clusters, medium shows res6, close-up shows res8 +- **Toggle**: Switch between clustered (fast) and all-points (detailed) views +- **Color-coded**: Points/clusters colored by dominant data source ::: @@ -42,8 +44,8 @@ Cesium.Ion.defaultAccessToken = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOi ```{ojs} //| echo: false viewof parquet_path = Inputs.text({ - label:"Source (iSamples Wide Format)", - value:"https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet", + label:"Source (iSamples Wide Format + H3)", + value:"https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet", placeholder: "URL or file:///path/to/file.parquet", width:"100%", submit:true @@ -82,6 +84,34 @@ viewof classifyTrigger = { classifyDots = classifyTrigger > 0 ? 
classifyTrigger : null
 ```
 
+```{ojs}
+//| echo: false
+// View mode toggle: clustered (H3 LOD) vs all points
+viewof viewModeToggle = Inputs.radio(["clustered", "all points"], {
+    label: "View Mode:",
+    value: "clustered"
+})
+```
+
+```{ojs}
+//| echo: false
+// Source color scheme for cluster coloring
+CLUSTER_COLORS = ({
+    'SESAR': '#3366CC',
+    'OPENCONTEXT': '#DC3912',
+    'GEOME': '#109618',
+    'SMITHSONIAN': '#FF9900',
+    'default': '#808080'
+})
+
+// H3 resolution based on camera height
+function getH3ResForHeight(height) {
+    if (height > 5000000) return 4; // Continental view
+    if (height > 500000) return 6; // Regional view
+    return 8; // Local view
+}
+```
+
 ::: {.callout-tip collapse="true"}
 #### Using a local cached file for faster performance
 
@@ -156,136 +186,244 @@ async function loadData(query, params = [], waiting_id = null, key = "default")
     }
 }
 
+// Query H3 clusters at a given resolution
+async function queryH3Clusters(h3Res) {
+    const col = `h3_res${h3Res}`;
+    const query = `
+        SELECT
+            ${col} as hex_id,
+            COUNT(*) as n,
+            AVG(latitude) as lat,
+            AVG(longitude) as lon,
+            MODE(source) as dominant_source
+        FROM nodes
+        WHERE otype = 'MaterialSampleRecord'
+          AND ${col} IS NOT NULL
+          AND latitude IS NOT NULL
+          AND longitude IS NOT NULL
+        GROUP BY ${col}
+    `;
+    return await loadData(query, [], "loading_1", "clusters");
+}
+
+// Render one globe point per H3 cluster: color = dominant source, size = log of sample count
+async function renderClusters(clusters) {
+    content.points.removeAll();
+    if (!clusters || clusters.length === 0) return;
+
+    const scalar = new Cesium.NearFarScalar(1.5e2, 2, 8.0e6, 0.5);
+    // Sizes use a fixed log scale, so no max-count pass is needed (Math.max(...largeArray) risks a RangeError)
+
+    for (const cluster of clusters) {
+        const source = cluster.dominant_source || 'default';
+        const colorHex = CLUSTER_COLORS[source] || CLUSTER_COLORS.default;
+        const color = Cesium.Color.fromCssColorString(colorHex);
+        // Size proportional to log of count (range: 4-20px)
+        const size = Math.max(4, Math.min(20, 4 + Math.log(Number(cluster.n)) * 2.5));
+ + content.points.add({ + id: `cluster_${cluster.hex_id}_n${cluster.n}_${source}`, + position: Cesium.Cartesian3.fromDegrees( + cluster.lon, + cluster.lat, + 0 + ), + pixelSize: size, + color: color, + scaleByDistance: scalar, + }); + } + content.enableTracking(); +} + locations = { // Performance telemetry performance.mark('locations-start'); - // Get loading indicator element for progress updates const loadingDiv = document.getElementById('loading_1'); if (loadingDiv) { loadingDiv.hidden = false; - loadingDiv.innerHTML = 'Loading geocodes...'; + loadingDiv.innerHTML = 'Loading...'; } - // Fast query: just get all distinct geocodes (no classification!) - // The Zenodo wide parquet has latitude/longitude as direct columns (like Eric's OpenContext) - const query = ` - SELECT DISTINCT - pid, - latitude, - longitude - FROM nodes - WHERE otype = 'GeospatialCoordLocation' - AND latitude IS NOT NULL - AND longitude IS NOT NULL - `; - - performance.mark('query-start'); - const data = await loadData(query, [], "loading_1", "locations"); - performance.mark('query-end'); - performance.measure('locations-query', 'query-start', 'query-end'); - const queryTime = performance.getEntriesByName('locations-query')[0].duration; - - // Handle null data (query failed) - if (!data) { - console.error('Query failed - data is null'); - if (loadingDiv) { - loadingDiv.innerHTML = 'Query failed - check console for errors'; - loadingDiv.hidden = false; - } - return []; + // Remove any existing camera change listener to avoid leaks + if (content._cameraChangedHandler) { + content.viewer.camera.changed.removeEventListener(content._cameraChangedHandler); + content._cameraChangedHandler = null; + } + if (content._cameraChangedDebounceTimer) { + clearTimeout(content._cameraChangedDebounceTimer); + content._cameraChangedDebounceTimer = null; } - console.log(`Query executed in ${queryTime.toFixed(0)}ms - retrieved ${data.length} locations`); + if (viewModeToggle === "clustered") { + // Clustered 
mode: load H3 clusters based on initial zoom level + if (loadingDiv) loadingDiv.innerHTML = 'Loading H3 clusters (res4)...'; - // Clear the existing PointPrimitiveCollection - content.points.removeAll(); + performance.mark('query-start'); + const clusters = await queryH3Clusters(4); + performance.mark('query-end'); + performance.measure('locations-query', 'query-start', 'query-end'); + const queryTime = performance.getEntriesByName('locations-query')[0].duration; - // Single color for all points (blue) - const defaultColor = Cesium.Color.fromCssColorString('#2E86AB'); - const defaultSize = 4; + if (!clusters) { + if (loadingDiv) { + loadingDiv.innerHTML = 'Cluster query failed - check console'; + loadingDiv.hidden = false; + } + return []; + } - // Render points in chunks to keep UI responsive - const CHUNK_SIZE = 500; - const scalar = new Cesium.NearFarScalar(1.5e2, 2, 8.0e6, 0.2); + console.log(`H3 cluster query (res4) in ${queryTime.toFixed(0)}ms - ${clusters.length} clusters`); + + performance.mark('render-start'); + await renderClusters(clusters); + performance.mark('render-end'); + performance.measure('locations-render', 'render-start', 'render-end'); + + if (loadingDiv) loadingDiv.hidden = true; + + // Set up camera change listener for zoom-adaptive LOD + let lastRes = 4; + let debounceTimer = null; + const cameraChangedHandler = () => { + if (viewModeToggle !== "clustered") return; + const height = content.viewer.camera.positionCartographic.height; + const newRes = getH3ResForHeight(height); + if (newRes !== lastRes) { + lastRes = newRes; + clearTimeout(debounceTimer); + debounceTimer = setTimeout(async () => { + const ld = document.getElementById('loading_1'); + if (ld) { ld.hidden = false; ld.innerHTML = `Loading H3 clusters (res${newRes})...`; } + const newClusters = await queryH3Clusters(newRes); + if (newClusters) { + await renderClusters(newClusters); + console.log(`Zoom-adaptive: switched to res${newRes}, ${newClusters.length} clusters`); + } + if 
(ld) ld.hidden = true; + }, 300); + content._cameraChangedDebounceTimer = debounceTimer; + } + }; + content._cameraChangedHandler = cameraChangedHandler; + content.viewer.camera.changed.addEventListener(cameraChangedHandler); + content.viewer.camera.percentageChanged = 0.1; + + performance.mark('locations-end'); + performance.measure('locations-total', 'locations-start', 'locations-end'); + return clusters; + + } else { + // All points mode: load every geocode (original behavior) + if (loadingDiv) loadingDiv.innerHTML = 'Loading all geocodes...'; + + const query = ` + SELECT DISTINCT + pid, + latitude, + longitude + FROM nodes + WHERE otype = 'GeospatialCoordLocation' + AND latitude IS NOT NULL + AND longitude IS NOT NULL + `; + + performance.mark('query-start'); + const data = await loadData(query, [], "loading_1", "locations"); + performance.mark('query-end'); + performance.measure('locations-query', 'query-start', 'query-end'); + const queryTime = performance.getEntriesByName('locations-query')[0].duration; + + if (!data) { + if (loadingDiv) { + loadingDiv.innerHTML = 'Query failed - check console for errors'; + loadingDiv.hidden = false; + } + return []; + } - performance.mark('render-start'); - for (let i = 0; i < data.length; i += CHUNK_SIZE) { - const chunk = data.slice(i, i + CHUNK_SIZE); - const endIdx = Math.min(i + CHUNK_SIZE, data.length); + console.log(`Query executed in ${queryTime.toFixed(0)}ms - retrieved ${data.length} locations`); - // Update progress indicator - if (loadingDiv) { - const pct = Math.round((endIdx / data.length) * 100); - loadingDiv.innerHTML = `Rendering geocodes... 
${endIdx.toLocaleString()}/${data.length.toLocaleString()} (${pct}%)`; - } + content.points.removeAll(); + const defaultColor = Cesium.Color.fromCssColorString('#2E86AB'); + const defaultSize = 4; + const CHUNK_SIZE = 500; + const scalar = new Cesium.NearFarScalar(1.5e2, 2, 8.0e6, 0.2); - // Add points for this chunk - for (const row of chunk) { - content.points.add({ - id: row.pid, - position: Cesium.Cartesian3.fromDegrees( - row.longitude, //longitude - row.latitude, //latitude - 0 //elevation, m - ), - pixelSize: defaultSize, - color: defaultColor, - scaleByDistance: scalar, - }); - } + performance.mark('render-start'); + for (let i = 0; i < data.length; i += CHUNK_SIZE) { + const chunk = data.slice(i, i + CHUNK_SIZE); + const endIdx = Math.min(i + CHUNK_SIZE, data.length); - // Yield to browser between chunks to keep UI responsive - if (i + CHUNK_SIZE < data.length) { - await new Promise(resolve => setTimeout(resolve, 0)); - } - } - performance.mark('render-end'); - performance.measure('locations-render', 'render-start', 'render-end'); - const renderTime = performance.getEntriesByName('locations-render')[0].duration; + if (loadingDiv) { + const pct = Math.round((endIdx / data.length) * 100); + loadingDiv.innerHTML = `Rendering geocodes... 
${endIdx.toLocaleString()}/${data.length.toLocaleString()} (${pct}%)`; + } - // Hide loading indicator - if (loadingDiv) { - loadingDiv.hidden = true; - } + for (const row of chunk) { + content.points.add({ + id: row.pid, + position: Cesium.Cartesian3.fromDegrees(row.longitude, row.latitude, 0), + pixelSize: defaultSize, + color: defaultColor, + scaleByDistance: scalar, + }); + } - performance.mark('locations-end'); - performance.measure('locations-total', 'locations-start', 'locations-end'); - const totalTime = performance.getEntriesByName('locations-total')[0].duration; + if (i + CHUNK_SIZE < data.length) { + await new Promise(resolve => setTimeout(resolve, 0)); + } + } + performance.mark('render-end'); + performance.measure('locations-render', 'render-start', 'render-end'); + const renderTime = performance.getEntriesByName('locations-render')[0].duration; - console.log(`Rendering completed in ${renderTime.toFixed(0)}ms`); - console.log(`Total time (query + render): ${totalTime.toFixed(0)}ms`); + if (loadingDiv) loadingDiv.hidden = true; - content.enableTracking(); - return data; + performance.mark('locations-end'); + performance.measure('locations-total', 'locations-start', 'locations-end'); + const totalTime = performance.getEntriesByName('locations-total')[0].duration; + console.log(`Rendering completed in ${renderTime.toFixed(0)}ms, total: ${totalTime.toFixed(0)}ms`); + + content.enableTracking(); + return data; + } } function createShowPrimitive(viewer) { return function(movement) { - // Get the point at the mouse end position const selectPoint = viewer.viewer.scene.pick(movement.endPosition); - // Clear the current selection, if there is one and it is different to the selectPoint + // Clear previous selection if (viewer.currentSelection !== null) { - //console.log(`selected.p ${viewer.currentSelection}`) if (Cesium.defined(selectPoint) && selectPoint !== viewer.currentSelection) { - console.log(`selected.p 2 ${viewer.currentSelection}`) - 
viewer.currentSelection.primitive.pixelSize = 4;
+                viewer.currentSelection.primitive.pixelSize = viewer.currentSelection._origSize || 4;
                 viewer.currentSelection.primitive.outlineColor = Cesium.Color.TRANSPARENT;
-                viewer.currentSelection.outlineWidth = 0;
+                viewer.currentSelection.primitive.outlineWidth = 0;
                 viewer.currentSelection = null;
             }
         }
-        // If selectPoint is valid and no currently selected point
         if (Cesium.defined(selectPoint) && selectPoint.hasOwnProperty("primitive")) {
-            //console.log(`showPrimitiveId ${selectPoint.id}`);
-            //const carto = Cesium.Cartographic.fromCartesian(selectPoint.primitive.position)
             viewer.pointLabel.position = selectPoint.primitive.position;
             viewer.pointLabel.label.show = true;
-            //viewer.pointLabel.label.text = `id:${selectPoint.id}, ${carto}`;
-            viewer.pointLabel.label.text = `${selectPoint.id}`;
+
+            // Parse cluster info from ID (format: cluster_<hex_id>_n<count>_<source>)
+            const id = String(selectPoint.id || '');
+            if (id.startsWith('cluster_')) {
+                const parts = id.split('_');
+                const count = parts[2] ? parts[2].replace('n', '') : '?';
+                const source = parts.slice(3).join('_') || '?';
+                const countNum = Number(count);
+                const countLabel = Number.isFinite(countNum) ? countNum.toLocaleString() : count;
+                viewer.pointLabel.label.text = `Cluster: ${countLabel} samples\nSource: ${source}\nCell: ${parts[1]}`;
+            } else {
+                viewer.pointLabel.label.text = `${selectPoint.id}`;
+            }
+            // Record the resting size only when the hover target changes; MOUSE_MOVE repeats over the same point would otherwise store the highlighted 20px
+            if (selectPoint !== viewer.currentSelection) selectPoint._origSize = selectPoint.primitive.pixelSize;
             selectPoint.primitive.pixelSize = 20;
             selectPoint.primitive.outlineColor = Cesium.Color.YELLOW;
             selectPoint.primitive.outlineWidth = 3;
@@ -556,7 +694,8 @@ selectedSamplesCombined = {
     }
 }
 
-md`Retrieved ${pointdata.length} locations from ${parquet_path}.`;
+md`Retrieved ${pointdata.length} ${viewModeToggle === "clustered" ? "clusters" : "locations"} from ${parquet_path}.
+${viewModeToggle === "clustered" ? "\n*Clustered view: point size reflects sample count, color reflects dominant source. 
Zoom in/out to change H3 resolution.*" : ""}`; ``` ```{ojs} diff --git a/tutorials/progressive_globe.qmd b/tutorials/progressive_globe.qmd new file mode 100644 index 0000000..ffa49d5 --- /dev/null +++ b/tutorials/progressive_globe.qmd @@ -0,0 +1,876 @@ +--- +title: "Progressive Globe: Instant H3 → Detail on Demand" +categories: [parquet, spatial, h3, performance, isamples] +--- + +Explore **6.7 million material samples** from iSamples — the globe loads instantly with H3 hexagonal aggregates, then refines as you zoom down to individual samples. + +::: {.callout-note collapse="true"} +## How It Works + +1. **Instant** (<1s): Pre-aggregated H3 res4 summary (580 KB) → 38K colored circles +2. **Zoom in**: Automatically switches to res6 (112K) then res8 (176K) clusters +3. **Zoom deeper** (<120 km): Individual sample points from 60 MB lite parquet +4. **Click**: Cluster info or individual sample card with full metadata + +Circle size = log(sample count). Color = dominant data source. +::: + + + + + + +
+
+
+
+
+
Loading...Resolution
+
0Clusters
+
0Samples
+
-Load Time
+
+
+
+ SESAR + OpenContext + GEOME + Smithsonian +
+
+
+ + +
+
+Loading H3 global overview... +
+
+
+
Click a cluster or sample on the globe
+
+
+
+
+ +```{ojs} +//| output: false +Cesium.Ion.defaultAccessToken = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiIwNzk3NjkyMy1iNGI1LTRkN2UtODRiMy04OTYwYWE0N2M3ZTkiLCJpZCI6Njk1MTcsImlhdCI6MTYzMzU0MTQ3N30.e70dpNzOCDRLDGxRguQCC-tRzGzA-23Xgno5lNgCeB4'; +``` + +```{ojs} +//| echo: false +//| output: false + +// === Constants === +R2_BASE = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev" +h3_res4_url = `${R2_BASE}/isamples_202601_h3_summary_res4.parquet` +h3_res6_url = `${R2_BASE}/isamples_202601_h3_summary_res6.parquet` +h3_res8_url = `${R2_BASE}/isamples_202601_h3_summary_res8.parquet` +lite_url = `${R2_BASE}/isamples_202601_samples_map_lite.parquet` +wide_url = `${R2_BASE}/isamples_202601_wide.parquet` + +SOURCE_COLORS = ({ + SESAR: '#3366CC', OPENCONTEXT: '#DC3912', + GEOME: '#109618', SMITHSONIAN: '#FF9900' +}) +SOURCE_NAMES = ({ + SESAR: 'SESAR', OPENCONTEXT: 'OpenContext', + GEOME: 'GEOME', SMITHSONIAN: 'Smithsonian' +}) + +// === URL State: encode/decode globe state in hash fragment === +function parseNum(val, def, min, max) { + if (val == null) return def; + const n = parseFloat(val); + if (!Number.isFinite(n)) return def; + if (min != null && n < min) return min; + if (max != null && n > max) return max; + return n; +} + +function readHash() { + const params = new URLSearchParams(location.hash.slice(1)); + return { + v: parseInt(params.get('v')) || 0, + lat: parseNum(params.get('lat'), null, -90, 90), + lng: parseNum(params.get('lng'), null, -180, 180), + alt: parseNum(params.get('alt'), null, 100, 40000000), + heading: parseNum(params.get('heading'), 0, 0, 360), + pitch: parseNum(params.get('pitch'), -90, -90, 0), + mode: params.get('mode') || null, + pid: params.get('pid') || null, + }; +} + +function buildHash(v) { + const cam = v.camera; + const carto = cam.positionCartographic; + const params = new URLSearchParams(); + params.set('v', '1'); + params.set('lat', Cesium.Math.toDegrees(carto.latitude).toFixed(4)); + params.set('lng', 
Cesium.Math.toDegrees(carto.longitude).toFixed(4)); + params.set('alt', Math.round(carto.height).toString()); + const heading = Cesium.Math.toDegrees(cam.heading) % 360; + const pitch = Cesium.Math.toDegrees(cam.pitch); + if (Math.abs(heading) > 1) params.set('heading', heading.toFixed(1)); + if (Math.abs(pitch + 90) > 1) params.set('pitch', pitch.toFixed(1)); + const gs = v._globeState; + if (gs.mode === 'point') params.set('mode', 'point'); + if (gs.selectedPid) params.set('pid', gs.selectedPid); + return '#' + params.toString(); +} + +// === Helpers: update DOM imperatively (no OJS reactivity) === +function updateStats(phase, points, samples, time) { + const s = (id, v) => { const e = document.getElementById(id); if (e) e.textContent = v; }; + s('sPhase', phase); + s('sPoints', points.toLocaleString()); + s('sSamples', samples.toLocaleString()); + s('sTime', time); +} + +function updatePhaseMsg(text, type) { + const m = document.getElementById('phaseMsg'); + if (!m) return; + m.textContent = text; + if (type === 'loading') { m.style.background = '#e3f2fd'; m.style.color = '#1565c0'; } + else { m.style.background = '#e8f5e9'; m.style.color = '#2e7d32'; } +} + +function updateClusterCard(info) { + const el = document.getElementById('clusterSection'); + if (!el) return; + if (!info) { + el.innerHTML = '
Click a cluster or sample on the globe
'; + return; + } + const color = SOURCE_COLORS[info.source] || '#666'; + const name = SOURCE_NAMES[info.source] || info.source; + el.innerHTML = `

Selected Cluster

+
+
+ ${name} + H3 res${info.resolution} +
+
+ ${info.count.toLocaleString()} samples +
+
+ ${info.lat.toFixed(4)}, ${info.lng.toFixed(4)} +
+
`; +} + +function updateSampleCard(sample) { + const el = document.getElementById('clusterSection'); + if (!el) return; + const color = SOURCE_COLORS[sample.source] || '#666'; + const name = SOURCE_NAMES[sample.source] || sample.source; + const placeParts = sample.place_name; + const placeStr = Array.isArray(placeParts) && placeParts.length > 0 + ? placeParts.filter(Boolean).join(' › ') + : ''; + el.innerHTML = `

Sample

+
+
+ ${name} +
+
+ ${sample.label || sample.pid || 'Unnamed'} +
+
+ ${sample.lat.toFixed(5)}, ${sample.lng.toFixed(5)} +
+ ${placeStr ? `
${placeStr}
` : ''} + ${sample.result_time ? `
Date: ${sample.result_time}
` : ''} +
Loading full details...
+
`; +} + +function updateSampleDetail(detail) { + const el = document.getElementById('sampleDetail'); + if (!el) return; + if (!detail) { + el.innerHTML = 'Detail query failed'; + return; + } + const desc = detail.description + ? (detail.description.length > 300 ? detail.description.slice(0, 300) + '...' : detail.description) + : ''; + el.innerHTML = `${desc ? `
${desc}
` : ''} +
+ Open in Analysis Tool → +
`; +} + +function updateSamples(samples) { + const el = document.getElementById('samplesSection'); + if (!el) return; + if (!samples || samples.length === 0) { + el.innerHTML = ''; + return; + } + let h = `

Nearby Samples (${samples.length})

`; + for (const s of samples) { + const color = SOURCE_COLORS[s.source] || '#666'; + const name = SOURCE_NAMES[s.source] || s.source; + const placeParts = s.place_name; + const desc = Array.isArray(placeParts) && placeParts.length > 0 + ? placeParts.filter(Boolean).join(' › ') + : ''; + h += `
+
+ ${s.label || s.pid} + ${name} +
+ ${desc ? `
${desc}
` : ''} +
`; + } + el.innerHTML = h; +} +``` + +```{ojs} +//| echo: false +//| output: false + +// === DuckDB === +db = { + const instance = await DuckDBClient.of(); + return instance; +} +``` + +```{ojs} +//| echo: false +//| output: false + +// === Cesium Viewer (created once, never re-created) === +viewer = { + const v = new Cesium.Viewer("cesiumContainer", { + timeline: false, + animation: false, + baseLayerPicker: false, + fullscreenElement: "cesiumContainer", + terrain: Cesium.Terrain.fromWorldTerrain() + }); + + // URL deep-link state (must be set before globalRect/once block reads it) + v._globeState = { mode: 'cluster', selectedPid: null }; + v._initialHash = readHash(); + v._suppressHashWrite = true; // cleared after zoomWatcher initializes + v._suppressTimer = null; + + const globalRect = Cesium.Rectangle.fromDegrees(-180, -60, 180, 80); + Cesium.Camera.DEFAULT_VIEW_RECTANGLE = globalRect; + Cesium.Camera.DEFAULT_VIEW_FACTOR = 0.5; + const ih = v._initialHash; + const once = () => { + if (ih.lat != null && ih.lng != null) { + v.camera.setView({ + destination: Cesium.Cartesian3.fromDegrees(ih.lng, ih.lat, ih.alt || 20000000), + orientation: { + heading: Cesium.Math.toRadians(ih.heading), + pitch: Cesium.Math.toRadians(ih.pitch) + } + }); + } else { + v.camera.setView({ destination: globalRect }); + } + v.scene.postRender.removeEventListener(once); + }; + v.scene.postRender.addEventListener(once); + + // Two separate point collections: clusters and individual samples + v.h3Points = new Cesium.PointPrimitiveCollection(); + v.scene.primitives.add(v.h3Points); + + v.samplePoints = new Cesium.PointPrimitiveCollection(); + v.scene.primitives.add(v.samplePoints); + v.samplePoints.show = false; // hidden until point mode + + // Hover tooltip — works for both clusters and samples + v.pointLabel = v.entities.add({ + label: { + show: false, showBackground: true, font: "13px monospace", + horizontalOrigin: Cesium.HorizontalOrigin.LEFT, + verticalOrigin: 
Cesium.VerticalOrigin.BOTTOM, + pixelOffset: new Cesium.Cartesian2(15, 0), + disableDepthTestDistance: Number.POSITIVE_INFINITY, text: "", + } + }); + + new Cesium.ScreenSpaceEventHandler(v.scene.canvas).setInputAction((movement) => { + const picked = v.scene.pick(movement.endPosition); + if (Cesium.defined(picked) && picked.primitive && picked.id) { + v.pointLabel.position = picked.primitive.position; + v.pointLabel.label.show = true; + const meta = picked.id; + if (typeof meta === 'object' && meta.type === 'sample') { + v.pointLabel.label.text = `${meta.label || meta.pid}`; + } else if (typeof meta === 'object' && meta.count) { + v.pointLabel.label.text = `${meta.source}: ${meta.count.toLocaleString()} samples`; + } else { + v.pointLabel.label.text = String(meta); + } + } else { + v.pointLabel.label.show = false; + } + }, Cesium.ScreenSpaceEventType.MOUSE_MOVE); + + // Click handler — routes to cluster card or sample card + new Cesium.ScreenSpaceEventHandler(v.scene.canvas).setInputAction(async (e) => { + const picked = v.scene.pick(e.position); + if (!Cesium.defined(picked) || !picked.primitive || !picked.id) return; + const meta = picked.id; + + if (typeof meta === 'object' && meta.type === 'sample') { + // --- Individual sample click --- + updateSampleCard(meta); + v._globeState.selectedPid = meta.pid; + history.pushState(null, '', buildHash(v)); + // Clear nearby list + const sampEl = document.getElementById('samplesSection'); + if (sampEl) sampEl.innerHTML = ''; + + // Stage 2: lazy-load full description from wide parquet + try { + const detail = await db.query(` + SELECT description + FROM read_parquet('${wide_url}') + WHERE pid = '${meta.pid.replace(/'/g, "''")}' + LIMIT 1 + `); + if (detail && detail.length > 0) { + updateSampleDetail(detail[0]); + } else { + updateSampleDetail({ description: '' }); + } + } catch(err) { + console.error("Detail query failed:", err); + updateSampleDetail(null); + } + + } else if (typeof meta === 'object' && meta.count) { + 
// --- Cluster click --- + updateClusterCard(meta); + v._globeState.selectedPid = null; + history.pushState(null, '', buildHash(v)); + + const sampEl = document.getElementById('samplesSection'); + if (sampEl) sampEl.innerHTML = '
Loading nearby samples...
'; + + const delta = meta.resolution === 4 ? 2.0 : meta.resolution === 6 ? 0.5 : 0.1; + try { + const samples = await db.query(` + SELECT pid, label, source, latitude, longitude, place_name + FROM read_parquet('${lite_url}') + WHERE latitude BETWEEN ${meta.lat - delta} AND ${meta.lat + delta} + AND longitude BETWEEN ${meta.lng - delta} AND ${meta.lng + delta} + LIMIT 30 + `); + updateSamples(samples); + } catch(err) { + console.error("Sample query failed:", err); + if (sampEl) sampEl.innerHTML = '
Query failed — try again.
'; + } + } + }, Cesium.ScreenSpaceEventType.LEFT_CLICK); + + return v; +} +``` + +```{ojs} +//| echo: false +//| output: false + +// === PHASE 1: Load H3 res4 globally (instant) === +phase1 = { + performance.mark('p1-start'); + + const data = await db.query(` + SELECT h3_cell, sample_count, center_lat, center_lng, + dominant_source, source_count + FROM read_parquet('${h3_res4_url}') + `); + + const scalar = new Cesium.NearFarScalar(1.5e2, 1.5, 8.0e6, 0.5); + let totalSamples = 0; + + for (const row of data) { + const count = row.sample_count; + totalSamples += count; + const size = Math.min(3 + Math.log10(count) * 4, 20); + viewer.h3Points.add({ + id: { count, source: row.dominant_source, lat: row.center_lat, lng: row.center_lng, resolution: 4 }, + position: Cesium.Cartesian3.fromDegrees(row.center_lng, row.center_lat, 0), + pixelSize: size, + color: Cesium.Color.fromCssColorString(SOURCE_COLORS[row.dominant_source] || '#666').withAlpha(0.8), + scaleByDistance: scalar, + }); + } + + performance.mark('p1-end'); + performance.measure('p1', 'p1-start', 'p1-end'); + const elapsed = performance.getEntriesByName('p1').pop().duration; + + updateStats('H3 Res4', data.length, totalSamples, `${(elapsed/1000).toFixed(1)}s`); + updatePhaseMsg(`${data.length.toLocaleString()} clusters, ${totalSamples.toLocaleString()} samples. 
Zoom in for finer detail.`, 'done'); + console.log(`Phase 1: ${data.length} clusters in ${elapsed.toFixed(0)}ms`); + + return { count: data.length, samples: totalSamples }; +} +``` + +```{ojs} +//| echo: false +//| output: false + +// === Zoom watcher: H3 cluster mode + individual sample point mode === +zoomWatcher = { + if (!phase1) return; + + // --- State --- + let mode = 'cluster'; // 'cluster' or 'point' + let currentRes = 4; + let loading = false; + let requestId = 0; // stale-request guard + + // Hysteresis thresholds to avoid flicker + const ENTER_POINT_ALT = 120000; // 120 km → enter point mode + const EXIT_POINT_ALT = 180000; // 180 km → exit point mode + const POINT_BUDGET = 5000; + + // Viewport cache: avoid re-querying same area + let cachedBounds = null; // { south, north, west, east } + let cachedData = null; // array of rows + + // --- H3 cluster loading (existing logic) --- + const loadRes = async (res, url) => { + if (loading) return; + loading = true; + updatePhaseMsg(`Loading H3 res${res}...`, 'loading'); + + try { + performance.mark(`r${res}-s`); + const data = await db.query(` + SELECT h3_cell, sample_count, center_lat, center_lng, + dominant_source, source_count + FROM read_parquet('${url}') + `); + + viewer.h3Points.removeAll(); + const scalar = new Cesium.NearFarScalar(1.5e2, 1.5, 8.0e6, 0.3); + let total = 0; + + for (const row of data) { + total += row.sample_count; + const size = Math.min(3 + Math.log10(row.sample_count) * 3.5, 18); + viewer.h3Points.add({ + id: { count: row.sample_count, source: row.dominant_source, lat: row.center_lat, lng: row.center_lng, resolution: res }, + position: Cesium.Cartesian3.fromDegrees(row.center_lng, row.center_lat, 0), + pixelSize: size, + color: Cesium.Color.fromCssColorString(SOURCE_COLORS[row.dominant_source] || '#666').withAlpha(0.85), + scaleByDistance: scalar, + }); + } + + performance.mark(`r${res}-e`); + performance.measure(`r${res}`, `r${res}-s`, `r${res}-e`); + const elapsed = 
performance.getEntriesByName(`r${res}`).pop().duration; + + updateStats(`H3 Res${res}`, data.length, total, `${(elapsed/1000).toFixed(1)}s`); + updatePhaseMsg(`${data.length.toLocaleString()} clusters, ${total.toLocaleString()} samples. ${res < 8 ? 'Zoom in for finer detail.' : 'Zoom closer for individual samples.'}`, 'done'); + + currentRes = res; + console.log(`Res${res}: ${data.length} clusters in ${elapsed.toFixed(0)}ms`); + } catch(err) { + console.error(`Failed to load res${res}:`, err); + updatePhaseMsg(`Failed to load H3 res${res} — try zooming again.`, 'loading'); + } finally { + loading = false; + } + }; + + // --- Get camera viewport bounds --- + function getViewportBounds() { + const rect = viewer.camera.computeViewRectangle(viewer.scene.globe.ellipsoid); + if (!rect) return null; + return { + south: Cesium.Math.toDegrees(rect.south), + north: Cesium.Math.toDegrees(rect.north), + west: Cesium.Math.toDegrees(rect.west), + east: Cesium.Math.toDegrees(rect.east) + }; + } + + // --- Check if viewport is within cached bounds --- + function isWithinCache(bounds) { + if (!cachedBounds || !bounds) return false; + return bounds.south >= cachedBounds.south && + bounds.north <= cachedBounds.north && + bounds.west >= cachedBounds.west && + bounds.east <= cachedBounds.east; + } + + // --- Load individual samples for current viewport --- + async function loadViewportSamples() { + const myReqId = ++requestId; + const bounds = getViewportBounds(); + if (!bounds) return; + + // If viewport is within cached area, just re-render from cache + if (isWithinCache(bounds) && cachedData) { + renderSamplePoints(cachedData, bounds); + return; + } + + // Fetch with 30% padding for smooth panning + const latPad = (bounds.north - bounds.south) * 0.3; + const lngPad = (bounds.east - bounds.west) * 0.3; + const padded = { + south: bounds.south - latPad, + north: bounds.north + latPad, + west: bounds.west - lngPad, + east: bounds.east + lngPad + }; + + updatePhaseMsg('Loading 
individual samples...', 'loading'); + + try { + performance.mark('sp-s'); + const data = await db.query(` + SELECT pid, label, source, latitude, longitude, + place_name, result_time + FROM read_parquet('${lite_url}') + WHERE latitude BETWEEN ${padded.south} AND ${padded.north} + AND longitude BETWEEN ${padded.west} AND ${padded.east} + LIMIT ${POINT_BUDGET} + `); + performance.mark('sp-e'); + performance.measure('sp', 'sp-s', 'sp-e'); + const elapsed = performance.getEntriesByName('sp').pop().duration; + + // Stale guard: discard if a newer request was issued + if (myReqId !== requestId) { + console.log(`Discarding stale sample response (req ${myReqId}, current ${requestId})`); + return; + } + + // Cache the padded bounds + data + cachedBounds = padded; + cachedData = Array.from(data); + + renderSamplePoints(cachedData, bounds); + + updateStats('Samples', cachedData.length, cachedData.length, `${(elapsed/1000).toFixed(1)}s`); + updatePhaseMsg(`${cachedData.length.toLocaleString()} individual samples. 
Click one for details.`, 'done'); + console.log(`Point mode: ${cachedData.length} samples in ${elapsed.toFixed(0)}ms`); + + } catch(err) { + if (myReqId !== requestId) return; + console.error("Viewport sample query failed:", err); + updatePhaseMsg('Sample query failed — try again.', 'loading'); + } + } + + // --- Render sample points on globe --- + function renderSamplePoints(data, bounds) { + viewer.samplePoints.removeAll(); + const scalar = new Cesium.NearFarScalar(1e2, 8, 2e5, 3); + + for (const row of data) { + const color = SOURCE_COLORS[row.source] || '#666'; + viewer.samplePoints.add({ + id: { + type: 'sample', + pid: row.pid, + label: row.label, + source: row.source, + lat: row.latitude, + lng: row.longitude, + place_name: row.place_name, + result_time: row.result_time + }, + position: Cesium.Cartesian3.fromDegrees(row.longitude, row.latitude, 0), + pixelSize: 6, + color: Cesium.Color.fromCssColorString(color).withAlpha(0.9), + scaleByDistance: scalar, + }); + } + } + + // --- Mode transitions --- + function enterPointMode(pushHistory) { + mode = 'point'; + viewer._globeState.mode = 'point'; + viewer.h3Points.show = false; + viewer.samplePoints.show = true; + if (pushHistory !== false) history.pushState(null, '', buildHash(viewer)); + loadViewportSamples(); + console.log('Entered point mode'); + } + + function exitPointMode(pushHistory) { + mode = 'cluster'; + viewer._globeState.mode = 'cluster'; + viewer.samplePoints.show = false; + viewer.samplePoints.removeAll(); + viewer.h3Points.show = true; + if (pushHistory !== false) history.pushState(null, '', buildHash(viewer)); + cachedBounds = null; + cachedData = null; + + // Restore cluster stats + let clusterCount = viewer.h3Points.length; + updateStats(`H3 Res${currentRes}`, clusterCount, '—', '—'); + updatePhaseMsg(`${clusterCount.toLocaleString()} clusters. 
Zoom closer for individual samples.`, 'done'); + console.log('Exited point mode'); + } + + // --- Camera change handler --- + let timer = null; + viewer.camera.changed.addEventListener(() => { + if (timer) clearTimeout(timer); + timer = setTimeout(async () => { + const h = viewer.camera.positionCartographic.height; + + // Determine target mode with hysteresis + const targetMode = h < ENTER_POINT_ALT ? 'point' + : h > EXIT_POINT_ALT ? 'cluster' + : mode; + + if (targetMode === 'point' && mode !== 'point') { + // Make sure we're at res8 clusters before transitioning + if (currentRes !== 8 && !loading) { + await loadRes(8, h3_res8_url); + } + enterPointMode(); + } else if (targetMode === 'cluster' && mode !== 'cluster') { + exitPointMode(); + // Reload appropriate resolution + const target = h > 3000000 ? 4 : h > 300000 ? 6 : 8; + if (target !== currentRes && !loading) { + await loadRes(target, { 4: h3_res4_url, 6: h3_res6_url, 8: h3_res8_url }[target]); + } + } else if (targetMode === 'point') { + // Already in point mode — update viewport samples + loadViewportSamples(); + } else { + // Cluster mode — check if resolution should change + const target = h > 3000000 ? 4 : h > 300000 ? 
6 : 8; + if (target !== currentRes && !loading) { + await loadRes(target, { 4: h3_res4_url, 6: h3_res6_url, 8: h3_res8_url }[target]); + } + } + + // Update URL hash (replaceState for continuous movement) + if (!viewer._suppressHashWrite) { + history.replaceState(null, '', buildHash(viewer)); + } + }, 600); + }); + viewer.camera.percentageChanged = 0.1; + + // --- Handle browser back/forward --- + window.addEventListener('hashchange', async () => { + const state = readHash(); + if (state.lat == null || state.lng == null) return; + + viewer._suppressHashWrite = true; + clearTimeout(viewer._suppressTimer); + viewer.camera.cancelFlight(); + viewer.camera.flyTo({ + destination: Cesium.Cartesian3.fromDegrees(state.lng, state.lat, state.alt || 20000000), + orientation: { + heading: Cesium.Math.toRadians(state.heading), + pitch: Cesium.Math.toRadians(state.pitch) + }, + duration: 1.5, + }); + + // After flight settles, force mode and clear suppress flag + viewer._suppressTimer = setTimeout(() => { + viewer._suppressHashWrite = false; + const s = readHash(); + if (s.mode === 'point' && mode !== 'point') enterPointMode(false); + else if (s.mode !== 'point' && mode === 'point') exitPointMode(false); + }, 2000); + + // Handle pid selection + if (state.pid) { + viewer._globeState.selectedPid = state.pid; + try { + const sample = await db.query(` + SELECT pid, label, source, latitude, longitude, place_name, result_time + FROM read_parquet('${lite_url}') + WHERE pid = '${state.pid.replace(/'/g, "''")}' + LIMIT 1 + `); + if (sample && sample.length > 0) { + const s = sample[0]; + updateSampleCard({ + pid: s.pid, label: s.label, source: s.source, + lat: s.latitude, lng: s.longitude, + place_name: s.place_name, result_time: s.result_time + }); + } + } catch(err) { + console.error("Hash pid query failed:", err); + } + } else { + viewer._globeState.selectedPid = null; + updateClusterCard(null); + } + }); + + // --- Share button --- + const shareBtn = 
document.getElementById('shareBtn'); + if (shareBtn) { + shareBtn.addEventListener('click', async () => { + history.replaceState(null, '', buildHash(viewer)); + try { + await navigator.clipboard.writeText(location.href); + const toast = document.getElementById('shareToast'); + if (toast) { + toast.style.opacity = '1'; + setTimeout(() => { toast.style.opacity = '0'; }, 2000); + } + } catch(err) { + prompt('Copy this link:', location.href); + } + }); + } + + // --- Deep-link: restore selection from initial hash --- + const ih = viewer._initialHash; + if (ih.pid) { + viewer._globeState.selectedPid = ih.pid; + try { + const sample = await db.query(` + SELECT pid, label, source, latitude, longitude, place_name, result_time + FROM read_parquet('${lite_url}') + WHERE pid = '${ih.pid.replace(/'/g, "''")}' + LIMIT 1 + `); + if (sample && sample.length > 0) { + const s = sample[0]; + updateSampleCard({ + pid: s.pid, label: s.label, source: s.source, + lat: s.latitude, lng: s.longitude, + place_name: s.place_name, result_time: s.result_time + }); + const detail = await db.query(` + SELECT description FROM read_parquet('${wide_url}') + WHERE pid = '${ih.pid.replace(/'/g, "''")}' + LIMIT 1 + `); + if (detail && detail.length > 0) updateSampleDetail(detail[0]); + else updateSampleDetail({ description: '' }); + } + } catch(err) { + console.error("Deep-link pid query failed:", err); + } + } + + // Enable hash writing now that everything is initialized + viewer._suppressHashWrite = false; + + return "active"; +} +``` + +## How This Demo Works + +Pre-aggregated H3 hexagonal indices achieve near-instant globe rendering, with seamless drill-down to individual samples: + +| Phase | Data | Size | Points | +|-------|------|------|--------| +| **Instant** | H3 res4 | 580 KB | 38K clusters (continental) | +| **Zoom in** | H3 res6 | 1.6 MB | 112K clusters (city) | +| **Zoom more** | H3 res8 | 2.5 MB | 176K clusters (neighborhood) | +| **Zoom deep** | Map lite | 60 MB (range req.) 
| Up to 5K individual samples | +| **Click sample** | Full dataset | ~280 MB (range req.) | Full metadata for 1 sample | + +**5 parquet files, zero backend.** All queries run in your browser via DuckDB-WASM with HTTP range requests — only the bytes you need are transferred. + +## See Also + +- [Cesium Globe (All Points)](/tutorials/parquet_cesium_isamples_wide.html) — Full point-level rendering +- [Interactive Explorer](/tutorials/isamples_explorer.html) — Search and filter with facets +- [Deep-Dive Analysis](/tutorials/zenodo_isamples_analysis.html) — DuckDB-WASM SQL tutorial diff --git a/tutorials/zenodo_isamples_analysis.qmd b/tutorials/zenodo_isamples_analysis.qmd index 48793f9..97b0e56 100644 --- a/tutorials/zenodo_isamples_analysis.qmd +++ b/tutorials/zenodo_isamples_analysis.qmd @@ -27,12 +27,17 @@ This tutorial demonstrates how to efficiently analyze large geospatial datasets ## Dataset Information -**Primary dataset** (Jan 2026): -- **URL**: `https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet` -- **Size**: ~280 MB wide format, 6.7M MaterialSampleRecords (20M total rows) +**Primary dataset** (Jan 2026, H3-indexed): +- **URL**: `https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet` +- **Size**: ~292 MB wide format with H3 indices, 6.7M MaterialSampleRecords (20M total rows) +- **H3 columns**: Pre-computed `h3_res4`, `h3_res6`, `h3_res8` (BIGINT) for spatial grouping - **Sources**: SESAR (4.6M), OpenContext (1M), GEOME (605K), Smithsonian (322K) - **Hosting**: Cloudflare R2 with HTTP range request support +**Facet summaries** (2KB, instant): +- **URL**: `https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_facet_summaries.parquet` +- **Schema**: `facet_type`, `facet_value`, `scheme`, `count` + **Note**: *Data was originally archived on Zenodo and is now served from Cloudflare R2 for better performance and reliability.* **Fallback dataset** (if remote data fails): @@ -81,8 +86,11 @@ d3 
= require("d3@7") topojson = require("topojson-client@3") // Dataset URLs - try multiple options for CORS compatibility -// Primary: Cloudflare R2 (Jan 2026 wide format) +// Primary: Cloudflare R2 (Jan 2026 wide format with H3 indices) parquet_urls = [ + 'https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide_h3.parquet', + + // Fallback: original wide format without H3 'https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet', // Fallback: older versions @@ -90,6 +98,9 @@ parquet_urls = [ 'https://zenodo.org/api/records/15278211/files/isamples_export_2025_04_21_16_23_46_geo.parquet/content' ] +// Pre-computed facet summaries (2KB - loads instantly) +facet_summaries_url = 'https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_facet_summaries.parquet' + // Test CORS and find working URL - with rate limiting protection working_parquet_url = { // Check if we've recently failed (to avoid repeated rate limiting) @@ -419,36 +430,89 @@ geo_stats = { }; } -// Regional analysis using bounding boxes +// Data-driven regional analysis using H3 res4 cell grouping +// Replaces hardcoded CASE WHEN bounding boxes with dynamic discovery regional_data = { - const result = await db.query(` - SELECT - CASE - WHEN sample_location_longitude BETWEEN -125 AND -66 - AND sample_location_latitude BETWEEN 24 AND 50 THEN 'North America' - WHEN sample_location_longitude BETWEEN -11 AND 40 - AND sample_location_latitude BETWEEN 35 AND 71 THEN 'Europe' - WHEN sample_location_longitude BETWEEN 95 AND 141 - AND sample_location_latitude BETWEEN 18 AND 54 THEN 'East Asia' - WHEN sample_location_longitude BETWEEN 113 AND 154 - AND sample_location_latitude BETWEEN -44 AND -10 THEN 'Australia' - ELSE 'Other' - END as region, - source_collection, - count(*) as sample_count, - avg(sample_location_latitude) as avg_lat, - avg(sample_location_longitude) as avg_lon - FROM isamples_data - WHERE sample_location_latitude IS NOT NULL - AND 
sample_location_longitude IS NOT NULL - GROUP BY 1, 2 - ORDER BY region, sample_count DESC - `); - // Convert BigInt values to Numbers - return result.toArray().map(row => ({ - ...row, - sample_count: Number(row.sample_count) - })); + // Check if h3_res4 column exists (H3-indexed file) + let hasH3 = false; + try { + const colCheck = await db.query(`SELECT h3_res4 FROM isamples_data LIMIT 1`); + hasH3 = true; + } catch (e) { + hasH3 = false; + } + + if (hasH3) { + // H3-based regional grouping: discover dense clusters dynamically + const result = await db.query(` + SELECT + h3_res4, + COUNT(*) as sample_count, + AVG(sample_location_latitude) as avg_lat, + AVG(sample_location_longitude) as avg_lon, + COUNT(DISTINCT source_collection) as source_count, + MODE(source_collection) as dominant_source + FROM isamples_data + WHERE sample_location_latitude IS NOT NULL + AND sample_location_longitude IS NOT NULL + AND h3_res4 IS NOT NULL + GROUP BY h3_res4 + HAVING COUNT(*) > 100 + ORDER BY sample_count DESC + `); + // Assign region labels based on centroid location + return result.toArray().map(row => { + const lat = row.avg_lat; + const lon = row.avg_lon; + let region; + if (lon >= -130 && lon <= -60 && lat >= 20 && lat <= 55) region = 'North America'; + else if (lon >= -15 && lon <= 45 && lat >= 30 && lat <= 75) region = 'Europe'; + else if (lon >= 90 && lon <= 150 && lat >= 15 && lat <= 55) region = 'East Asia'; + else if (lon >= 110 && lon <= 160 && lat >= -50 && lat <= -5) region = 'Australia'; + else if (lon >= -90 && lon <= -30 && lat >= -60 && lat <= 15) region = 'South America'; + else if (lon >= -20 && lon <= 55 && lat >= -40 && lat <= 30) region = 'Africa'; + else region = 'Other'; + return { + region, + source_collection: row.dominant_source, + sample_count: Number(row.sample_count), + avg_lat: row.avg_lat, + avg_lon: row.avg_lon, + h3_cell: row.h3_res4, + source_count: Number(row.source_count) + }; + }); + } else { + // Fallback for non-H3 files: use simple 
lat/lon-based grouping + const result = await db.query(` + SELECT + source_collection, + count(*) as sample_count, + avg(sample_location_latitude) as avg_lat, + avg(sample_location_longitude) as avg_lon + FROM isamples_data + WHERE sample_location_latitude IS NOT NULL + AND sample_location_longitude IS NOT NULL + GROUP BY source_collection + ORDER BY sample_count DESC + `); + return result.toArray().map(row => { + const lat = row.avg_lat; + const lon = row.avg_lon; + let region = 'Other'; + if (lon >= -130 && lon <= -60 && lat >= 20 && lat <= 55) region = 'North America'; + else if (lon >= -15 && lon <= 45 && lat >= 30 && lat <= 75) region = 'Europe'; + else if (lon >= 90 && lon <= 150 && lat >= 15 && lat <= 55) region = 'East Asia'; + else if (lon >= 110 && lon <= 160 && lat >= -50 && lat <= -5) region = 'Australia'; + return { + region, + source_collection: row.source_collection, + sample_count: Number(row.sample_count), + avg_lat: row.avg_lat, + avg_lon: row.avg_lon + }; + }); + } } ``` @@ -461,8 +525,10 @@ md` - **Latitude range**: ${geo_stats.min_lat.toFixed(3)}° to ${geo_stats.max_lat.toFixed(3)}° - **Longitude range**: ${geo_stats.min_lon.toFixed(3)}° to ${geo_stats.max_lon.toFixed(3)}° - **Average location**: ${geo_stats.avg_lat.toFixed(3)}°, ${geo_stats.avg_lon.toFixed(3)}° -- **Total regional records**: ${regional_data.length} -- **Regions found**: ${[...new Set(regional_data.map(d => d.region))].join(', ')} +- **Dense H3 clusters**: ${regional_data.length} (cells with >100 samples) +- **Regions discovered**: ${[...new Set(regional_data.map(d => d.region))].join(', ')} + +*Regional grouping is data-driven using H3 resolution-4 hexagonal cells, replacing hardcoded bounding boxes.* ` ``` @@ -486,27 +552,26 @@ viewof selected_region = Inputs.select( ```{ojs} //| label: regional-chart -// Regional distribution chart +// Regional distribution chart (data-driven from H3 clusters) regional_chart = { - // Validate that regional_data is an array if 
(!Array.isArray(regional_data)) { return html`
Error: Regional data is not available
`; } - - // Aggregate the regional data by region like we do for source data + + // Aggregate H3 cell data by discovered region const regionTotals = d3.rollup( - regional_data, - v => d3.sum(v, d => d.sample_count), + regional_data, + v => d3.sum(v, d => d.sample_count), d => d.region ); - + const aggregatedData = Array.from(regionTotals, ([region, total]) => ({ region: region, sample_count: total })).sort((a, b) => b.sample_count - a.sample_count); - + return Plot.plot({ - title: `Sample Distribution by Region (${aggregatedData.length} regions)`, + title: `Sample Distribution by Region (H3-derived, ${aggregatedData.length} regions)`, width: 700, height: 300, marginLeft: 120, @@ -527,7 +592,7 @@ regional_chart = { }), Plot.text(aggregatedData, { x: "sample_count", - y: "region", + y: "region", text: d => d3.format("~s")(d.sample_count), dx: 10, textAnchor: "start" @@ -714,42 +779,76 @@ Explore the distribution of material categories across different sources. ```{ojs} //| label: material-analysis -// Get top material categories by source +// Material data: use pre-computed facet summaries for instant results +// Falls back to full-scan if summaries unavailable material_data = { - const result = await db.query(` - SELECT - source_collection, - has_material_category, - count(*) as category_count - FROM isamples_data - WHERE has_material_category IS NOT NULL - GROUP BY source_collection, has_material_category - ORDER BY source_collection, category_count DESC - `); - // Convert BigInt values to Numbers - return result.toArray().map(row => ({ - ...row, - category_count: Number(row.category_count) - })); + try { + // Try pre-computed summaries first (2KB, instant) + const result = await db.query(` + SELECT + facet_value as has_material_category, + 'ALL' as source_collection, + count as category_count + FROM read_parquet('${facet_summaries_url}') + WHERE facet_type = 'material' + ORDER BY count DESC + `); + return result.toArray().map(row => ({ + ...row, + category_count: 
Number(row.category_count) + })); + } catch (e) { + console.warn("Facet summaries unavailable, falling back to full scan:", e.message); + const result = await db.query(` + SELECT + source_collection, + has_material_category, + count(*) as category_count + FROM isamples_data + WHERE has_material_category IS NOT NULL + GROUP BY source_collection, has_material_category + ORDER BY source_collection, category_count DESC + `); + return result.toArray().map(row => ({ + ...row, + category_count: Number(row.category_count) + })); + } } -// Get top 10 categories overall +// Top categories from pre-computed summaries (instant) top_categories = { - const result = await db.query(` - SELECT - has_material_category, - count(*) as total_count - FROM isamples_data - WHERE has_material_category IS NOT NULL - GROUP BY has_material_category - ORDER BY total_count DESC - LIMIT 10 - `); - // Convert BigInt values to Numbers - return result.toArray().map(row => ({ - ...row, - total_count: Number(row.total_count) - })); + try { + const result = await db.query(` + SELECT + facet_value as has_material_category, + count as total_count + FROM read_parquet('${facet_summaries_url}') + WHERE facet_type = 'material' + ORDER BY count DESC + LIMIT 10 + `); + return result.toArray().map(row => ({ + ...row, + total_count: Number(row.total_count) + })); + } catch (e) { + console.warn("Facet summaries unavailable, falling back to full scan:", e.message); + const result = await db.query(` + SELECT + has_material_category, + count(*) as total_count + FROM isamples_data + WHERE has_material_category IS NOT NULL + GROUP BY has_material_category + ORDER BY total_count DESC + LIMIT 10 + `); + return result.toArray().map(row => ({ + ...row, + total_count: Number(row.total_count) + })); + } } ```