corpus with examples

Examples related to the corpus submodule.

# Authors: The scikit-plots developers
# SPDX-License-Identifier: BSD-3-Clause

Download the related packages

First we download the media preprocessing libraries (text, image, audio, or video):

pip install nltk gensim langdetect faster-whisper openai-whisper pytesseract youtube-transcript-api
sudo apt-get install tesseract-ocr
pip install scikit-plots[corpus]

# .. seealso::
#    * galleries/examples/00-jupyter_notebooks/corpus/plot_corpus_from_any_media_notebook.ipynb

# import faster_whisper
# import whisper  # openai-whisper
# import youtube_transcript_api

import os
import json
import sys
import textwrap
from pathlib import Path

import scikitplot as sp
from scikitplot import corpus

from scikitplot.corpus._schema import (
    CorpusDocument,
    SourceType,
    MatchMode,
    SectionType,
    ChunkingStrategy,
    _PROMOTED_RAW_KEYS,
)
from scikitplot.corpus._base import DocumentReader, DefaultFilter
from scikitplot.corpus._normalizers import (
    NormalizationPipeline,
    UnicodeNormalizer,
    WhitespaceNormalizer,
)
from scikitplot.corpus._enrichers._nlp_enricher import NLPEnricher, EnricherConfig
from scikitplot.corpus._similarity._similarity import (
    SimilarityIndex,
    SearchConfig,
    SearchResult,
)
from scikitplot.corpus._adapters import (
    to_langchain_documents,
    to_langgraph_state,
    to_mcp_resources,
    to_mcp_tool_result,
    to_huggingface_dataset,
    to_rag_tuples,
    to_jsonl,
    MCPCorpusServer,
)

# ===========================================================================
# HELPER: print section banners
# ===========================================================================

def banner(title: str, char: str = "=") -> None:
    """Print ``title`` framed above and below by a 72-character rule.

    Parameters
    ----------
    title : str
        Heading text; printed indented by two spaces.
    char : str, default "="
        String repeated 72 times to build each rule line.
    """
    rule = char * 72
    # Leading and trailing empty items reproduce the blank line on each side.
    framed = "\n".join(["", rule, f"  {title}", rule, ""])
    print(framed)

def mini_banner(title: str) -> None:
    """Print ``title`` as an indented ``--- title ---`` sub-heading.

    A blank line is emitted before and after the heading.
    """
    heading = f"  --- {title} ---"
    # Empty first/last fields joined by newlines give the surrounding blank lines.
    print("", heading, "", sep="\n")

def show_doc(doc: CorpusDocument, index: int = 0) -> None:
    """Pretty-print a single CorpusDocument.

    Always prints a header line (index, first 12 characters of ``doc_id``,
    ``source_type``) and a preview of the first 100 characters of ``text``.
    The remaining lines are printed only when the corresponding attribute
    is set: normalized text (first 80 characters), tokens (first 8),
    keywords (first 6), timecode range, confidence and page number.

    Parameters
    ----------
    doc : CorpusDocument
        Document to display.
    index : int, default 0
        Position shown in the ``[  n]`` prefix of the header line.
    """
    # Newlines are flattened so each preview stays on a single output line.
    text_preview = doc.text[:100].replace("\n", " ")
    norm_preview = (doc.normalized_text or "")[:80].replace("\n", " ")
    print(f"  [{index:3d}] doc_id={doc.doc_id[:12]}…  source_type={doc.source_type}")
    print(f"        text: {text_preview!r}…")
    if norm_preview:
        print(f"        norm: {norm_preview!r}…")
    if doc.tokens:
        print(f"        tokens({len(doc.tokens)}): {doc.tokens[:8]}…")
    if doc.keywords:
        print(f"        keywords: {doc.keywords[:6]}")
    if doc.timecode_start is not None:
        # Only the start is checked above; formatting a missing end with
        # ".1f" would raise TypeError, so show a placeholder instead.
        if doc.timecode_end is None:
            end_label = "?"
        else:
            end_label = f"{doc.timecode_end:.1f}s"
        print(f"        timecode: {doc.timecode_start:.1f}s – {end_label}")
    if doc.confidence is not None:
        print(f"        confidence: {doc.confidence:.3f}")
    if doc.page_number is not None:
        print(f"        page: {doc.page_number}")

# ===========================================================================
# PHASE 1: INGEST — Process all 5 source types via DocumentReader
# ===========================================================================

banner("PHASE 1: INGEST — 5 Source Types via DocumentReader")

# Accumulators shared by the later phases: every ingested chunk, plus one
# status record per source.
all_documents: list[CorpusDocument] = []
source_log: list[dict] = []

# --- Source ①: Web Article (HTML) ---
# A local copy of the article is used when it exists in the working
# directory; otherwise the page is read directly from its URL.
mini_banner("Source ①: Web Article (text proxy for HTML URL)")
try:
    local_article = Path("who_health_care_article.txt")
    if not local_article.exists():
        # No local copy: read the raw HTML page from its URL.
        reader = DocumentReader.from_url(
            "https://www.who.int/europe/news/item/12-12-2023-out-of-pocket-payments-for-primary-health-care-unaffordable-for-millions-in-europe-new-who-report-shows",
        )
    else:
        # Local copy present: attach the source metadata explicitly.
        reader = DocumentReader.create(
            local_article,
            source_type=SourceType.WEB,
            source_title="Out-of-pocket payments for health care unaffordable for millions in Europe",
            source_author="WHO Regional Office for Europe",
            source_date="2023-12-12",
            collection_id="who-greece-financial-protection",
        )
    docs = [*reader.get_documents()]
    all_documents += docs
    source_log.append({"type": "web_article", "n_docs": len(docs), "status": "OK"})
    print(f"  ✓ Web article: {len(docs)} chunks ingested")
    if docs:
        show_doc(docs[0], 0)
except Exception as e:
    # Best-effort ingestion: record the failure and let the example continue.
    failure = {"type": "web_article", "n_docs": 0, "status": f"ERROR: {e}"}
    source_log.append(failure)
    print(f"  ✗ Web article: {e}")

========================================================================
  PHASE 1: INGEST — 5 Source Types via DocumentReader
========================================================================


  --- Source ①: Web Article (text proxy for HTML URL) ---

  ✓ Web article: 91 chunks ingested
  [  0] doc_id=9432e3e0cdf3…  source_type=web
        text: 'Out-of-pocket payments for primary health care unaffordable for millions in Europe, new WHO report s'…

# Inspect the text of the chunk at index 60 (needs at least 61 ingested chunks).
docs[60].text

'Catastrophic out-of-pocket payments are mainly driven by household spending on services that are commonly delivered or managed in primary-care settings, indicating important gaps in the coverage of primary care in many countries.'

# Text of the chunk at index 60 of the accumulated corpus list
# (needs at least 61 documents in all_documents).
all_documents[60].text

'Catastrophic out-of-pocket payments are mainly driven by household spending on services that are commonly delivered or managed in primary-care settings, indicating important gaps in the coverage of primary care in many countries.'

# --- Ingestion Summary ---
# One line per logged source, followed by the overall corpus size.
mini_banner("Ingestion Summary")
for entry in source_log:
    if entry["status"] == "OK":
        mark = "✓"
    else:
        mark = "⚠"
    kind, count, detail = entry["type"], entry["n_docs"], entry["status"]
    print(f"  {mark} {kind:25s} → {count:3d} docs  [{detail}]")
corpus_size = len(all_documents)
print(f"\n  Total documents in corpus: {corpus_size}")

--- Ingestion Summary ---

✓ web_article               →  91 docs  [OK]

Total documents in corpus: 91

# ===========================================================================
# PHASE 4: INDEX — Build SimilarityIndex (KEYWORD mode, no embeddings)
# ===========================================================================

banner("PHASE 4: INDEX — Build SimilarityIndex (BM25 keyword mode)")

# Keyword matching, returning the top 5 hits per query.
keyword_config = SearchConfig(match_mode="keyword", top_k=5)
index = SimilarityIndex(config=keyword_config)
index.build(all_documents)
n_indexed, is_dense = index.n_documents, index.has_embeddings
print(f"  ✓ Index built: {n_indexed} documents, dense={is_dense}")

========================================================================
  PHASE 4: INDEX — Build SimilarityIndex (BM25 keyword mode)
========================================================================

  ✓ Index built: 91 documents, dense=False

# ===========================================================================
# PHASE 6: ADAPTERS — Export to every downstream format
# ===========================================================================

banner("PHASE 6: ADAPTERS — Export to LangChain / MCP / RAG / LangGraph / HF")

# --- 6a: LangChain Documents ---
mini_banner("6a: LangChain Documents")
lc_docs = to_langchain_documents(all_documents)
# Ingestion errors are caught rather than raised, so the corpus (and this
# list) may be empty; indexing [0] unguarded would raise IndexError.
first = lc_docs[0] if lc_docs else None
if first is None:
    print("  ⚠ 0 LangChain docs (corpus is empty)")
elif isinstance(first, dict):
    # Plain dicts are what the adapter returns when langchain is unavailable.
    print(f"  ✓ {len(lc_docs)} LangChain docs (dict fallback — langchain not installed)")
    print(f"    keys: {list(first.keys())}")
    print(f"    page_content[:80]: {first['page_content'][:80]!r}")
    print(f"    metadata keys: {sorted(first['metadata'].keys())[:10]}")
else:
    print(f"  ✓ {len(lc_docs)} LangChain Document objects")

# --- 6b: LangGraph State ---
# Wrap the corpus and the query into a single state dictionary.
mini_banner("6b: LangGraph State")
state = to_langgraph_state(all_documents, query="catastrophic health spending", match_mode="keyword")
print("  ✓ LangGraph state dict:")
state_keys = sorted(state.keys())
print(f"    keys: {state_keys}")
n_results = state["n_results"]
print(f"    n_results: {n_results}")
query_text = state["query"]
print(f"    query: {query_text!r}")

# --- 6c: MCP Resources ---
# Convert the first three documents and display the first two resources.
mini_banner("6c: MCP Resources (Model Context Protocol)")
resources = to_mcp_resources(all_documents[:3])
for resource in resources[:2]:
    print("  resource:")
    print(f"    uri:      {resource['uri']}")
    print(f"    name:     {resource['name']}")
    print(f"    mimeType: {resource['mimeType']}")
    snippet = resource["text"][:60]
    print(f"    text[:60]: {snippet!r}…")

# --- 6d: MCP Tool Result ---
# Build a tools/call-style response from the first three documents.
mini_banner("6d: MCP Tool Result (tools/call response)")
tool_result = to_mcp_tool_result(all_documents[:3])
print("  ✓ MCP tool result:")
print(f"    isError: {tool_result['isError']}")
print(f"    content items: {len(tool_result['content'])}")
shown_items = tool_result["content"][:2]
for content_item in shown_items:
    snippet = content_item["text"][:60]
    print(f"    [{content_item['type']}] text[:60]: {snippet!r}…")
    print(f"         annotations: {content_item['annotations']}")

# --- 6e: MCP Server (adapter class) ---
# Expose the similarity index through the MCP server adapter and list its tools.
mini_banner("6e: MCP Server Adapter")
mcp_server = MCPCorpusServer(index=index, server_name="who-corpus")
tools = mcp_server.list_tools()
print(f"  ✓ MCPCorpusServer: {mcp_server}")
tool_names = [tool["name"] for tool in tools]
print(f"    tools: {tool_names}")
# Only the first 200 characters of the first tool's schema are shown.
schema_json = json.dumps(tools[0]["inputSchema"], indent=6)
print(f"    tool schema: {schema_json[:200]}…")

# --- 6f: HuggingFace Dataset ---
# The adapter result is either a plain column dict or a dataset object.
mini_banner("6f: HuggingFace Dataset")
hf = to_huggingface_dataset(all_documents)
if not isinstance(hf, dict):
    print(f"  ✓ HuggingFace Dataset: {hf}")
else:
    print("  ✓ HuggingFace column dict (datasets lib not installed)")
    column_names = sorted(hf.keys())
    print(f"    columns: {column_names}")
    row_count = len(hf["text"])
    print(f"    rows: {row_count}")

# --- 6g: RAG Tuples ---
# Each tuple unpacks to (text, metadata, embedding).
mini_banner("6g: RAG Tuples (text, metadata, embedding)")
tuples = to_rag_tuples(all_documents[:3])
for position, (chunk_text, metadata, embedding) in enumerate(tuples):
    print(f"  [{position}] text[:50]: {chunk_text[:50]!r}")
    leading_keys = sorted(metadata.keys())[:8]
    print(f"      meta keys: {leading_keys}")
    embedding_kind = type(embedding).__name__
    print(f"      embedding: {embedding_kind}")

# --- 6h: JSONL ---
# Serialize the first three documents, then decode two lines to preview them.
mini_banner("6h: JSONL Streaming")
lines = [*to_jsonl(all_documents[:3])]
print(f"  ✓ {len(lines)} JSONL lines")
for position, raw_line in enumerate(lines[:2]):
    record = json.loads(raw_line)
    leading_keys = sorted(record.keys())[:8]
    snippet = record["text"][:50]
    print(f"  [{position}] keys: {leading_keys}…  text[:50]: {snippet!r}")

========================================================================
  PHASE 6: ADAPTERS — Export to LangChain / MCP / RAG / LangGraph / HF
========================================================================


  --- 6a: LangChain Documents ---

  ✓ 91 LangChain docs (dict fallback — langchain not installed)
    keys: ['page_content', 'metadata']
    page_content[:80]: 'Out-of-pocket payments for primary health care unaffordable for millions in Euro'
    metadata keys: ['char_end', 'char_start', 'chunk_index', 'chunking_strategy', 'doc_id', 'element_index', 'html_tag', 'section_type', 'source_file', 'source_type']

  --- 6b: LangGraph State ---

  ✓ LangGraph state dict:
    keys: ['documents', 'match_mode', 'n_results', 'query']
    n_results: 91
    query: 'catastrophic health spending'

  --- 6c: MCP Resources (Model Context Protocol) ---

  resource:
    uri:      corpus://9432e3e0cdf3de0d
    name:     https://www.who.int/europe/news/item/12-12-2023-out-of-pocket-payments-for-primary-health-care-unaffordable-for-millions-in-europe-new-who-report-shows
    mimeType: text/plain
    text[:60]: 'Out-of-pocket payments for primary health care unaffordable '…
  resource:
    uri:      corpus://d4100a2a7807259d
    name:     https://www.who.int/europe/news/item/12-12-2023-out-of-pocket-payments-for-primary-health-care-unaffordable-for-millions-in-europe-new-who-report-shows
    mimeType: text/plain
    text[:60]: 'Regions WHO Regional websites Africa Americas South-East Asi'…

  --- 6d: MCP Tool Result (tools/call response) ---

  ✓ MCP tool result:
    isError: False
    content items: 3
    [text] text[:60]: 'Out-of-pocket payments for primary health care unaffordable '…
         annotations: {'doc_id': '9432e3e0cdf3de0d', 'source_file': 'https://www.who.int/europe/news/item/12-12-2023-out-of-pocket-payments-for-primary-health-care-unaffordable-for-millions-in-europe-new-who-report-shows', 'source_title': None, 'chunk_index': 0, 'score': None}
    [text] text[:60]: 'Regions WHO Regional websites Africa Americas South-East Asi'…
         annotations: {'doc_id': 'd4100a2a7807259d', 'source_file': 'https://www.who.int/europe/news/item/12-12-2023-out-of-pocket-payments-for-primary-health-care-unaffordable-for-millions-in-europe-new-who-report-shows', 'source_title': None, 'chunk_index': 2, 'score': None}

  --- 6e: MCP Server Adapter ---

  ✓ MCPCorpusServer: MCPCorpusServer(name='who-corpus', n_docs=91)
    tools: ['corpus_search']
    tool schema: {
      "type": "object",
      "properties": {
            "query": {
                  "type": "string",
                  "description": "Search query text"
            },
            "top_k": {
  …

  --- 6f: HuggingFace Dataset ---

  ✓ HuggingFace column dict (datasets lib not installed)
    columns: ['chunk_index', 'doc_id', 'language', 'metadata_json', 'source_file', 'source_title', 'source_type', 'text']
    rows: 91

  --- 6g: RAG Tuples (text, metadata, embedding) ---

  [0] text[:50]: 'Out-of-pocket payments for primary health care una'
      meta keys: ['char_end', 'char_start', 'chunk_index', 'chunking_strategy', 'doc_id', 'element_index', 'html_tag', 'section_type']
      embedding: NoneType
  [1] text[:50]: 'Regions WHO Regional websites Africa Americas Sout'
      meta keys: ['char_end', 'char_start', 'chunk_index', 'chunking_strategy', 'doc_id', 'element_index', 'html_tag', 'section_type']
      embedding: NoneType
  [2] text[:50]: 'Countries Albania Andorra Armenia Austria Azerbaij'
      meta keys: ['char_end', 'char_start', 'chunk_index', 'chunking_strategy', 'doc_id', 'element_index', 'html_tag', 'section_type']
      embedding: NoneType

  --- 6h: JSONL Streaming ---

  ✓ 3 JSONL lines
  [0] keys: ['char_end', 'char_start', 'chunk_index', 'chunking_strategy', 'doc_id', 'element_index', 'html_tag', 'section_type']…  text[:50]: 'Out-of-pocket payments for primary health care una'
  [1] keys: ['char_end', 'char_start', 'chunk_index', 'chunking_strategy', 'doc_id', 'element_index', 'html_tag', 'section_type']…  text[:50]: 'Regions WHO Regional websites Africa Americas Sout'

Tags: model-type: classification model-workflow: corpus plot-type: bar level: beginner purpose: showcase

Total running time of the script: (0 minutes 0.705 seconds)

Related examples