corpus Knowledge and Information local .png with examples

Examples related to the corpus submodule.

# Authors: The scikit-plots developers
# SPDX-License-Identifier: BSD-3-Clause
import os
import json
import sys
import textwrap
from pathlib import Path

import scikitplot as sp
from scikitplot import corpus
from scikitplot.corpus import (
    DocumentReader,
    CorpusPipeline,
    SentenceChunker,
    SentenceChunkerConfig,
    ExportFormat,
    CorpusDocument,
    SourceType,
    SentenceBackend,
    EnricherConfig,
    NLPEnricher,
)

via CorpusPipeline

# Build the sentence chunker step by step: config first, then the chunker
# that is handed to the pipeline. The NLTK backend is selected explicitly.
sentence_config = SentenceChunkerConfig(backend=SentenceBackend.NLTK)
sentence_chunker = SentenceChunker(sentence_config)

# NOTE(review): the `_zip` suffix on these names does not match the PNG input;
# presumably carried over from the sibling ".zip" example. The names are kept
# because later cells refer to `result_zip`.
pipeline_zip = CorpusPipeline(
    chunker=sentence_chunker,
    output_dir=Path("output/"),
    export_format=ExportFormat.CSV,
)

# Run the pipeline on a single local image file.
image_path = Path("data/echo_of_the_wise/AI_Generated_Image_1ix.png")
result_zip = pipeline_zip.run(image_path)

# Bare expression: in the rendered gallery page its repr is shown as output.
result_zip
PipelineResult(source='data/echo_of_the_wise/AI_Generated_Image_1ix.png', n_documents=8, n_omitted=0, n_embedded=0, elapsed=4.2s, output=output/AI_Generated_Image_1ix.csv)
import pandas as pd
from pprint import pprint

# Read back the CSV the pipeline wrote, then preview the first rows as a
# nested mapping of column name -> {row index: value}.
exported_df = pd.read_csv(result_zip.output_path)
preview = exported_df.head().to_dict()
pprint(preview)
{'act': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'bbox': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'char_end': {0: 229, 1: 299, 2: 607, 3: 1011, 4: 1179},
 'char_start': {0: 6, 1: 230, 2: 301, 3: 608, 4: 1013},
 'chunk_index': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
 'chunking_strategy': {0: 'sentence',
                       1: 'sentence',
                       2: 'sentence',
                       3: 'sentence',
                       4: 'sentence'},
 'collection_id': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'confidence': {0: 0.6372, 1: 0.6372, 2: 0.6372, 3: 0.6372, 4: 0.6372},
 'content_hash': {0: '72d9a66ad2010fe6f95c336e9b967aef',
                  1: '2ae9055ee90f61d3ed9a6ee7a8425acc',
                  2: '929db36bee285ec5d8f380cacba9e157',
                  3: '8fd437ea14514826c2c1dcae91e2c3c3',
                  4: '1890c9fdc4ee3b5ba62205dd8c1c7cf0'},
 'doc_id': {0: '033fe979c6bb209c',
            1: '2829df524657858f',
            2: '4f3824727d6cf878',
            3: '2c4188cef46cee22',
            4: 'b203e048a59ee500'},
 'doi': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'frame_index': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'image_height': {0: 1024, 1: 1024, 2: 1024, 3: 1024, 4: 1024},
 'image_width': {0: 1024, 1: 1024, 2: 1024, 3: 1024, 4: 1024},
 'isbn': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'keywords': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'language': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'lemmas': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'line_number': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'modality': {0: 'text', 1: 'text', 2: 'text', 3: 'text', 4: 'text'},
 'normalized_text': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'ocr_engine': {0: 'tesseract',
                1: 'tesseract',
                2: 'tesseract',
                3: 'tesseract',
                4: 'tesseract'},
 'page_number': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
 'paragraph_index': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'parent_doc_id': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'raw_dtype': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'raw_shape': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'scene_number': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'section_type': {0: 'text', 1: 'text', 2: 'text', 3: 'text', 4: 'text'},
 'source_author': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'source_date': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'source_file': {0: 'AI_Generated_Image_1ix.png',
                 1: 'AI_Generated_Image_1ix.png',
                 2: 'AI_Generated_Image_1ix.png',
                 3: 'AI_Generated_Image_1ix.png',
                 4: 'AI_Generated_Image_1ix.png'},
 'source_title': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'source_type': {0: 'image', 1: 'image', 2: 'image', 3: 'image', 4: 'image'},
 'stems': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'text': {0: 'ire uursacesced Caraga io\n'
             '10 Bi6doKew 8 Erna monet) 1b / aoe Maa ETT\n'
             'RCO RSP eo ere\n'
             '\n'
             'Memmnminsane(s)\n'
             'erklaren kannst, hast du\n'
             'Creerona eats\n'
             '\n'
             ' \n'
             ' \n'
             '      \n'
             '     \n'
             '   \n'
             ' \n'
             ' \n'
             ' \n'
             ' \n'
             ' \n'
             '  \n'
             ' \n'
             '\n'
             ' \n'
             '\n'
             'Brome ccrlhy | |\n'
             ' Petesercne | verstanden.',
          1: '>\n'
             'If you cannot explain\n'
             '» Sas ONAN Co oiag\n'
             'understand it well enough.',
          2: 'ge VIDA ND TRON\n'
             '\n'
             'Sa eas\n'
             'aE Nia)\n'
             '\n'
             '   \n'
             '\n'
             '‘ApiotoréAnc + AASEavSp0¢ , Richard P. Feynman j Albert '
             'Einstein\n'
             '\n'
             '384-322 BC - 356-323 BC | Mieza, Macedonia 1918-1988 | New York, '
             'USA — Princeton — Pasadena 99-1955 | Ulm — Princeton\n'
             '\n'
             ' \n'
             ' \n'
             ' \n'
             ' \n'
             '  \n'
             '  \n'
             ' \n'
             ' \n'
             '   \n'
             '\n'
             'ONSEN\n'
             'venient Cc ae\n'
             '| Crore mokoeoeri ae} A\n'
             '\n'
             'to a bartender.',
          3: 'aa ve, r\n'
             'es Clenrecemnnc\n'
             'ahupiingao ka taea\n'
             'POMBELCUICIC IN\n'
             'SSRI LIC\n'
             '\n'
             ' \n'
             '\n'
             'Pye matterhow much you kaov\n'
             'your words reach only as far as the\n'
             'other person can understand,\n'
             '\n'
             ': ugh $9) glen a lat 9 CS lb cle Lage I y\n'
             '\n'
             ' \n'
             '\n'
             '       \n'
             ' \n'
             '\n'
             'Ernest Rutherford F Mevlana\n'
             '1871-1937 | Nelson, NZ > Cambridge _—_ warm, Ed 3 | Balkh > '
             'Konya\n'
             '\n'
             '           \n'
             ' \n'
             '\n'
             'Sato r le Scholar aCe Is\n'
             '\n'
             'Simplicity is the mark\n'
             '7 Os true knowledge.',
          4: '“ (Focused)  (Pocused) | (Distracted)\n'
             '\n'
             'cenit)\n'
             '(Innocent)\n'
             '\n'
             '   \n'
             '\n'
             'You may know all the worlds, but the conversation sto,\n'
             'j at the other person’s intelligence and vision.'},
 'timecode_end': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'timecode_start': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'tokens': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
 'total_frames': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1},
 'url': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan}}
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# Show the image the pipeline was run on, without axis ticks or frame.
# An explicit Figure/Axes pair is used instead of the implicit pyplot state.
img = mpimg.imread(result_zip.source)
fig, ax = plt.subplots()
ax.imshow(img)
ax.axis('off')  # hide the axes so only the picture is drawn
plt.show()
plot corpus knowledge script

Tags: model-type: classification | model-workflow: corpus | plot-type: text | level: beginner | purpose: showcase

Total running time of the script: (0 minutes 4.513 seconds)

Related examples

corpus WHO European Region local .zip with examples

corpus WHO European Region local .zip with examples

corpus A Tale of Two Cities .mp3 with examples

corpus A Tale of Two Cities .mp3 with examples

corpus WHO European Region YouTube shorts with examples

corpus WHO European Region YouTube shorts with examples

corpus WHO European Region local or url per file with examples

corpus WHO European Region local or url per file with examples

Gallery generated by Sphinx-Gallery