corpus Knowledge and Information local .png with examples
Examples related to the corpus submodule.
# Authors: The scikit-plots developers
# SPDX-License-Identifier: BSD-3-Clause
import os
import json
import sys
import textwrap
from pathlib import Path
import scikitplot as sp
from scikitplot import corpus
from scikitplot.corpus import (
DocumentReader,
CorpusPipeline,
SentenceChunker,
SentenceChunkerConfig,
ExportFormat,
CorpusDocument,
SourceType,
SentenceBackend,
EnricherConfig,
NLPEnricher,
)
Via CorpusPipeline
# Build a pipeline that chunks text by sentence (NLTK backend) and is
# configured with CSV as its export format and "output/" as its output dir.
chunker_config = SentenceChunkerConfig(backend=SentenceBackend.NLTK)
sentence_chunker = SentenceChunker(chunker_config)
pipeline_zip = CorpusPipeline(
    chunker=sentence_chunker,
    output_dir=Path("output/"),
    export_format=ExportFormat.CSV,
)

# Run the pipeline on a single local PNG image.
image_path = Path("data/echo_of_the_wise/AI_Generated_Image_1ix.png")
result_zip = pipeline_zip.run(image_path)

# Bare expression so the example gallery displays the result's repr.
result_zip
PipelineResult(source='data/echo_of_the_wise/AI_Generated_Image_1ix.png', n_documents=8, n_omitted=0, n_embedded=0, elapsed=4.2s, output=output/AI_Generated_Image_1ix.csv)
import pandas as pd
from pprint import pprint
# Load the CSV written by the pipeline run and pretty-print its first rows
# as a {column: {row_index: value}} mapping.
exported_frame = pd.read_csv(result_zip.output_path)
preview_rows = exported_frame.head()
pprint(preview_rows.to_dict())
{'act': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'bbox': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'char_end': {0: 229, 1: 299, 2: 607, 3: 1011, 4: 1179},
'char_start': {0: 6, 1: 230, 2: 301, 3: 608, 4: 1013},
'chunk_index': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
'chunking_strategy': {0: 'sentence',
1: 'sentence',
2: 'sentence',
3: 'sentence',
4: 'sentence'},
'collection_id': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'confidence': {0: 0.6372, 1: 0.6372, 2: 0.6372, 3: 0.6372, 4: 0.6372},
'content_hash': {0: '72d9a66ad2010fe6f95c336e9b967aef',
1: '2ae9055ee90f61d3ed9a6ee7a8425acc',
2: '929db36bee285ec5d8f380cacba9e157',
3: '8fd437ea14514826c2c1dcae91e2c3c3',
4: '1890c9fdc4ee3b5ba62205dd8c1c7cf0'},
'doc_id': {0: '033fe979c6bb209c',
1: '2829df524657858f',
2: '4f3824727d6cf878',
3: '2c4188cef46cee22',
4: 'b203e048a59ee500'},
'doi': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'frame_index': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'image_height': {0: 1024, 1: 1024, 2: 1024, 3: 1024, 4: 1024},
'image_width': {0: 1024, 1: 1024, 2: 1024, 3: 1024, 4: 1024},
'isbn': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'keywords': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'language': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'lemmas': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'line_number': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'modality': {0: 'text', 1: 'text', 2: 'text', 3: 'text', 4: 'text'},
'normalized_text': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'ocr_engine': {0: 'tesseract',
1: 'tesseract',
2: 'tesseract',
3: 'tesseract',
4: 'tesseract'},
'page_number': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
'paragraph_index': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'parent_doc_id': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'raw_dtype': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'raw_shape': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'scene_number': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'section_type': {0: 'text', 1: 'text', 2: 'text', 3: 'text', 4: 'text'},
'source_author': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'source_date': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'source_file': {0: 'AI_Generated_Image_1ix.png',
1: 'AI_Generated_Image_1ix.png',
2: 'AI_Generated_Image_1ix.png',
3: 'AI_Generated_Image_1ix.png',
4: 'AI_Generated_Image_1ix.png'},
'source_title': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'source_type': {0: 'image', 1: 'image', 2: 'image', 3: 'image', 4: 'image'},
'stems': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'text': {0: 'ire uursacesced Caraga io\n'
'10 Bi6doKew 8 Erna monet) 1b / aoe Maa ETT\n'
'RCO RSP eo ere\n'
'\n'
'Memmnminsane(s)\n'
'erklaren kannst, hast du\n'
'Creerona eats\n'
'\n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
'\n'
' \n'
'\n'
'Brome ccrlhy | |\n'
' Petesercne | verstanden.',
1: '>\n'
'If you cannot explain\n'
'» Sas ONAN Co oiag\n'
'understand it well enough.',
2: 'ge VIDA ND TRON\n'
'\n'
'Sa eas\n'
'aE Nia)\n'
'\n'
' \n'
'\n'
'‘ApiotoréAnc + AASEavSp0¢ , Richard P. Feynman j Albert '
'Einstein\n'
'\n'
'384-322 BC - 356-323 BC | Mieza, Macedonia 1918-1988 | New York, '
'USA — Princeton — Pasadena 99-1955 | Ulm — Princeton\n'
'\n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
' \n'
'\n'
'ONSEN\n'
'venient Cc ae\n'
'| Crore mokoeoeri ae} A\n'
'\n'
'to a bartender.',
3: 'aa ve, r\n'
'es Clenrecemnnc\n'
'ahupiingao ka taea\n'
'POMBELCUICIC IN\n'
'SSRI LIC\n'
'\n'
' \n'
'\n'
'Pye matterhow much you kaov\n'
'your words reach only as far as the\n'
'other person can understand,\n'
'\n'
': ugh $9) glen a lat 9 CS lb cle Lage I y\n'
'\n'
' \n'
'\n'
' \n'
' \n'
'\n'
'Ernest Rutherford F Mevlana\n'
'1871-1937 | Nelson, NZ > Cambridge _—_ warm, Ed 3 | Balkh > '
'Konya\n'
'\n'
' \n'
' \n'
'\n'
'Sato r le Scholar aCe Is\n'
'\n'
'Simplicity is the mark\n'
'7 Os true knowledge.',
4: '“ (Focused) (Pocused) | (Distracted)\n'
'\n'
'cenit)\n'
'(Innocent)\n'
'\n'
' \n'
'\n'
'You may know all the worlds, but the conversation sto,\n'
'j at the other person’s intelligence and vision.'},
'timecode_end': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'timecode_start': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'tokens': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan},
'total_frames': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1},
'url': {0: nan, 1: nan, 2: nan, 3: nan, 4: nan}}
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Show the image the pipeline was run on, with the axes switched off.
plt.imshow(mpimg.imread(result_zip.source))
plt.gca().set_axis_off()
plt.show()
Total running time of the script: (0 minutes 4.513 seconds)
Related examples
corpus WHO European Region local .zip with examples
corpus WHO European Region YouTube shorts with examples
corpus WHO European Region local or url per file with examples