corpus WHO European Region local .zip with examples
Examples related to the corpus submodule.
# Authors: The scikit-plots developers
# SPDX-License-Identifier: BSD-3-Clause
import os
import json
import sys
import textwrap
from pathlib import Path
import scikitplot as sp
from scikitplot import corpus
from scikitplot.corpus import (
DocumentReader,
CorpusPipeline,
SentenceChunker,
SentenceChunkerConfig,
ExportFormat,
CorpusDocument,
SourceType,
SentenceBackend,
EnricherConfig,
NLPEnricher,
)
ZIP archive with per-extension kwargs
Pass a nested "reader_kwargs" key to configure individual member types
inside the archive independently. Global kwargs go alongside it; per-extension
values always win when both specify the same key.
The pattern mirrors the ZipReader
constructor signature — the pipeline threads the outer dict straight through.
# zip_to_doc = list(
# DocumentReader.create(
# "data/WHO-EURO-2025-12555-52329-80560-eng.zip",
# reader_kwargs={
# ".mp3": {"transcribe": True, "whisper_model": "small"},
# # ".jpg": {"backend": "easyocr"}, # uncomment to enable OCR on images
# },
# ).get_documents()
# )
# zip_to_doc
The same ZIP via CorpusPipeline
# Sentence-level chunking, using the NLTK backend.
sentence_chunker = SentenceChunker(
    SentenceChunkerConfig(backend=SentenceBackend.NLTK)
)

# Options keyed by archive-member extension. They are nested under a
# "reader_kwargs" key so each file type inside the ZIP is configured
# independently.
per_extension_kwargs = {
    ".mp3": {"transcribe": True, "whisper_model": "small"},
    # ".jpg": {"backend": "easyocr"},  # uncomment to enable OCR on images
}

pipeline_zip = CorpusPipeline(
    chunker=sentence_chunker,
    output_dir=Path("output/"),
    export_format=ExportFormat.CSV,
    reader_kwargs={"reader_kwargs": per_extension_kwargs},
)

# Run the pipeline on the local archive; the trailing bare expression lets
# the result be displayed when the example is rendered.
archive_path = Path("data/WHO-EURO-2025-12555-52329-80560-eng.zip")
result_zip = pipeline_zip.run(archive_path)
result_zip
PipelineResult(source='data/WHO-EURO-2025-12555-52329-80560-eng.zip', n_documents=7, n_omitted=0, n_embedded=0, elapsed=18.7s, output=output/WHO-EURO-2025-12555-52329-80560-eng.csv)
Total running time of the script: (0 minutes 18.680 seconds)
Related examples
corpus WHO European Region YouTube shorts with examples
corpus WHO European Region YouTube shorts with examples
corpus WHO European Region local or url per file with examples
corpus WHO European Region local or url per file with examples