BuilderConfig#

class scikitplot.corpus.BuilderConfig(chunker='sentence', chunker_kwargs=<factory>, normalize=True, normalizer_steps=<factory>, normalizer_kwargs=<factory>, enrich=False, enricher_kwargs=<factory>, embed=False, embedding_model='all-MiniLM-L6-v2', embedding_kwargs=<factory>, build_index=False, index_kwargs=<factory>, source_title=None, source_author=None, source_type=None, collection_id=None, default_language=None, filter_kwargs=<factory>, max_download_bytes=524288000, download_timeout=120, download_max_retries=3, download_retry_backoff=1.0, max_archive_files=10000, max_archive_bytes=2147483648, probe_url_content_type=True, probe_url_timeout=15, max_workers=1)[source]#

Configuration for CorpusBuilder.

Parameters:
chunker : str or object

Chunker to use. One of "sentence", "paragraph", "fixed_window", "word"; or a pre-configured chunker instance (a ChunkerBase subclass or a new-style chunker, which is auto-bridged).

chunker_kwargs : dict[str, Any]

Keyword arguments passed to the chunker constructor (ignored if chunker is already an instance).

normalize : bool

Run normalisation pipeline after filtering.

normalizer_steps : list[str]

Normaliser names: "unicode", "whitespace", "html_strip", "lowercase", "dedup_lines". Default: ["unicode", "whitespace"].

normalizer_kwargs : dict[str, Any]

Keyword arguments for NormalizerConfig.

enrich : bool

Run NLPEnricher after normalisation.

enricher_kwargs : dict[str, Any]

Kwargs for EnricherConfig.

embed : bool

Run EmbeddingEngine after enrichment.

embedding_model : str

Model name for EmbeddingEngine.

embedding_kwargs : dict[str, Any]

Kwargs for EmbeddingEngine constructor.

build_index : bool

Build a SimilarityIndex after embedding.

index_kwargs : dict[str, Any]

Kwargs for SearchConfig.

source_title : str or None

Default source_title for all documents.

source_author : str or None

Default source_author for all documents.

source_type : str or None

Default source_type (e.g., "book", "movie").

collection_id : str or None

Group identifier for this corpus build.

default_language : str or None

ISO 639-1 language code.

filter_kwargs : dict[str, Any]

Kwargs for DefaultFilter.

max_workers : int

Parallelism for multi-file ingestion.

probe_url_content_type : bool

When True (default), extensionless URLs are probed with an HTTP HEAD request to infer the correct reader before downloading. Disable to save a round-trip when all URLs have file extensions.

probe_url_timeout : int

HTTP timeout in seconds for probe_url_kind calls. Default: 15.
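As a sketch of how the download-safety parameters above combine (the parameter names come from this class; the specific values are illustrative only):

```python
from scikitplot.corpus import BuilderConfig

# Tighter limits for untrusted URL sources: smaller per-URL downloads,
# a single retry, and no HEAD probing (avoids the extra round-trip).
config = BuilderConfig(
    max_download_bytes=50 * 1024 * 1024,  # 50 MB per URL
    download_timeout=30,
    download_max_retries=1,
    probe_url_content_type=False,
)
```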


Notes

Most users need only:

config = BuilderConfig(chunker="sentence", embed=True)

Everything else has sensible defaults.
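A fuller configuration is sketched below; every keyword is documented above, but the chosen values (model name aside, which is the documented default) are illustrative, not recommendations:

```python
from scikitplot.corpus import BuilderConfig

# Illustrative: sentence chunking, normalisation with HTML stripping,
# embeddings plus a similarity index, and default document metadata.
config = BuilderConfig(
    chunker="sentence",
    normalizer_steps=["unicode", "whitespace", "html_strip"],
    embed=True,
    embedding_model="all-MiniLM-L6-v2",
    build_index=True,
    source_type="book",
    collection_id="my-corpus",
    default_language="en",
)
```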

build_index: bool = False#
chunker: str | Any = 'sentence'#
chunker_kwargs: dict[str, Any][source]#
collection_id: str | None = None#
default_language: str | None = None#
download_max_retries: int = 3#

Maximum retry attempts for transient HTTP errors (429, 500, 502, 503, 504) during URL downloads. Set to 0 to disable retries. Default: 3.

download_retry_backoff: float = 1.0#

Base delay in seconds for exponential back-off between download retries. Actual wait = download_retry_backoff * 2 ** attempt. Default: 1.0.
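With the defaults (download_retry_backoff=1.0, download_max_retries=3), the formula above yields the following schedule, assuming attempts are numbered from 0:

```python
# Exponential back-off: wait = download_retry_backoff * 2 ** attempt
backoff = 1.0      # download_retry_backoff default
max_retries = 3    # download_max_retries default

waits = [backoff * 2 ** attempt for attempt in range(max_retries)]
print(waits)  # [1.0, 2.0, 4.0] -> waits of 1 s, 2 s, then 4 s
```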

download_timeout: int = 120#

HTTP timeout for URL downloads in seconds. Default: 120.

embed: bool = False#
embedding_kwargs: dict[str, Any][source]#
embedding_model: str = 'all-MiniLM-L6-v2'#
enrich: bool = False#
enricher_kwargs: dict[str, Any][source]#
filter_kwargs: dict[str, Any][source]#
index_kwargs: dict[str, Any][source]#
max_archive_bytes: int = 2147483648#

Maximum cumulative extracted size per archive. Default: 2 GB.

max_archive_files: int = 10000#

Maximum file count inside a single archive. Default: 10,000.

max_download_bytes: int = 524288000#

Maximum download size per URL in bytes. Default: 500 MB.

max_workers: int = 1#
normalize: bool = True#
normalizer_kwargs: dict[str, Any][source]#
normalizer_steps: list[str][source]#
probe_url_content_type: bool = True#

Probe extensionless URLs with a HEAD request to determine the correct reader. When True (default), any URL that classify_url classifies as WEB_PAGE and has no file extension in its path is probed via probe_url_kind before routing. Set to False to skip the extra network round-trip (e.g. when all your URLs already carry file extensions or you want pure-offline operation). Default: True.

probe_url_timeout: int = 15#

HTTP timeout in seconds for the URL-probing HEAD request. Default: 15.

source_author: str | None = None#
source_title: str | None = None#
source_type: str | None = None#