Index (Cython) Python API with examples
An example showing the Index class.
import numpy as np
import random; random.seed(0)
from pprint import pprint
import struct; print(struct.calcsize('P')*8)
64
from scikitplot.annoy._annoy import Index # Cython: 160 concrete index/data/metric combinations in total (8 index types S × 4 data types T × 5 metrics)
from scikitplot.annoy import AnnoyIndex # C++: 1 concrete index/data type (uint64 index, float32 data)
# 32 bit int index
i = Index(10, "angular")
i.load(f"./test.tree")
# This might change in the future if we change the search algorithm, but in that case let's update the test
result1 = i.get_nns_by_item(0, 10)
# 64 bit uint index
j = AnnoyIndex(10, "angular")
for idx in range(i.get_n_items()):
j.add_item(idx, i.get_item(idx))
j.build(10)
j.save("test64.tree")
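result2 = i.get_nns_by_item(0, 10) # NOTE: this queries `i` (the index loaded from ./test.tree), not the newly built `j`, so result2 equals result1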
i = AnnoyIndex(10, "angular")
i.load(f"./test64.tree")
# This might change in the future if we change the search algorithm, but in that case let's update the test
result3 = i.get_nns_by_item(0, 10)
result1, result2, result3
([0, 85, 42, 11, 54, 38, 53, 66, 19, 31], [0, 85, 42, 11, 54, 38, 53, 66, 19, 31], [0, 85, 42, 11, 54, 38, 26, 53, 66, 3])
Index(int f: Optional[int] = None, str metric: Optional[str] = None, int n_neighbors: int = 5, *, str on_disk_path: Optional[str] = None, bool prefault: bool = False, int seed: Optional[int] = None, int verbose: Optional[int] = None, int schema_version: int = 0, str dtype: str = 'float32', str index_dtype: str = 'int32', str wrapper_dtype: str = 'uint64', str random_dtype: str = 'uint64', int n_jobs: Optional[int] = None, **kwargs)
Annoy Approximate Nearest Neighbors Index.
This is a Cython-powered Python wrapper around the Annoy C++ library.
Parameters
----------
f : int or None, default=None
Embedding dimension. If 0 or None, dimension is inferred from first
vector added. Must be positive for immediate index construction.
metric : str or None, default=None
Distance metric. Supported values:
* "angular", "cosine" → cosine-like distance
* "euclidean", "l2", "lstsq" → L2 distance
* "manhattan", "l1", "cityblock", "taxicab" → L1 distance
* "dot", "@", ".", "dotproduct", "inner", "innerproduct" → negative dot product
* "hamming" → bitwise Hamming distance
If None and f > 0, defaults to "angular" with FutureWarning.
n_trees : int, default=-1
Number of trees to build. If -1, auto-selects based on dimension.
More trees = better accuracy but slower queries and more memory.
n_neighbors : int, default=5
Default number of neighbors for queries (estimator parameter).
on_disk_path : str or None, default=None
Path for on-disk building. If provided, enables memory-efficient
building for large indices.
prefault : bool, default=False
Whether to prefault pages when loading (may improve query latency).
seed : int or None, default=None
Random seed for tree construction. If None, uses Annoy's default.
Value 0 is treated as "use default" and emits a UserWarning.
verbose : int or None, default=None
Verbosity level (clamped to [-2, 2]). Level >= 1 enables logging.
schema_version : int, default=0
Pickle schema version marker (does not affect on-disk format).
dtype : str, default="float32"
Data type for embeddings. Supported values:
* "float16" / "half" / "fp16" → float16_t (16-bit half precision)
* "float32" / "single" / "fp32" → float (32-bit single precision, default)
* "float64" / "double" / "fp64" → double (64-bit double precision)
* "float128" / "quad" / "fp128" → float128_t (128-bit or long double)
All types are accessed via the double-precision widened bridge.
float16 values are narrowed on add_item; float128 gains no input precision
but benefits from higher-precision internal arithmetic on GCC/Clang.
index_dtype : str, default="int32"
Index identifier type. Supported values:
* "int8" → int8_t (max 127 items)
* "uint8" → uint8_t (max 255 items)
* "int16" → int16_t (max 32,767 items)
* "uint16" → uint16_t (max 65,535 items)
* "int32" → int32_t (max 2,147,483,647 items, default)
* "uint32" → uint32_t (max 4,294,967,295 items)
* "int64" → int64_t (max 9,223,372,036,854,775,807 items)
* "uint64" → uint64_t (max 18,446,744,073,709,551,615 items)
wrapper_dtype : str, default="uint64"
Internal wrapper type (e.g., for Hamming packing).
Future: "bool", "uint8", "uint32" etc.
random_dtype : str, default="uint64"
Random seed type. Currently only "uint64" supported.
n_jobs : int or None, default=None
Number of threads. If -1, uses all available cores.
**kwargs
Future extensibility
Attributes
----------
f : int
Embedding dimension (0 means "unset / lazy").
metric : str or None
Canonical metric name, or None if not configured.
ptr : AnnoyIndexInterface*
Pointer to C++ index (NULL if not constructed).
# State Indicators (Internal)
_f_valid : bool
True if f has been set (> 0)
_metric_valid : bool
True if metric has been configured
_index_constructed : bool
True if C++ index exists (ptr != NULL)
Notes
-----
* 32-bit integer (4 bytes) can store values from −2**31 to 2**31−1, roughly ±2 billion.
* 64-bit integer (8 bytes) can store values from −2**63 to 2**63−1, roughly ±9 quintillion.
Examples
--------
>>> index = Index(f=128, metric='angular', seed=42)
>>> index.add_item(0, [0.1] * 128)
>>> index.add_item(1, [0.2] * 128)
>>> index.build(n_trees=10)
>>> neighbors, distances = index.get_nns_by_item(0, n=5, include_distances=True)
set dtype:
>>> # Standard usage (float32)
>>> index = Index(f=128, metric='angular', dtype='float32')
>>>
>>> # High precision (float64)
>>> index = Index(f=128, metric='euclidean', dtype='float64')
>>>
>>> # Half precision (float16) - future
>>> # index = Index(f=128, metric='angular', dtype='float16')
import sys
import tempfile
from pathlib import Path
import scikitplot
# spotify/annoy Backward compatibility helper
sys.modules["annoy"] = scikitplot.annoy._annoy # now `import annoy` will resolve to your module
import annoy; print(annoy.AnnoyIndex.__doc__)
Index(int f: Optional[int] = None, str metric: Optional[str] = None, int n_neighbors: int = 5, *, str on_disk_path: Optional[str] = None, bool prefault: bool = False, int seed: Optional[int] = None, int verbose: Optional[int] = None, int schema_version: int = 0, str dtype: str = 'float32', str index_dtype: str = 'int32', str wrapper_dtype: str = 'uint64', str random_dtype: str = 'uint64', int n_jobs: Optional[int] = None, **kwargs)
Annoy Approximate Nearest Neighbors Index.
This is a Cython-powered Python wrapper around the Annoy C++ library.
Parameters
----------
f : int or None, default=None
Embedding dimension. If 0 or None, dimension is inferred from first
vector added. Must be positive for immediate index construction.
metric : str or None, default=None
Distance metric. Supported values:
* "angular", "cosine" → cosine-like distance
* "euclidean", "l2", "lstsq" → L2 distance
* "manhattan", "l1", "cityblock", "taxicab" → L1 distance
* "dot", "@", ".", "dotproduct", "inner", "innerproduct" → negative dot product
* "hamming" → bitwise Hamming distance
If None and f > 0, defaults to "angular" with FutureWarning.
n_trees : int, default=-1
Number of trees to build. If -1, auto-selects based on dimension.
More trees = better accuracy but slower queries and more memory.
n_neighbors : int, default=5
Default number of neighbors for queries (estimator parameter).
on_disk_path : str or None, default=None
Path for on-disk building. If provided, enables memory-efficient
building for large indices.
prefault : bool, default=False
Whether to prefault pages when loading (may improve query latency).
seed : int or None, default=None
Random seed for tree construction. If None, uses Annoy's default.
Value 0 is treated as "use default" and emits a UserWarning.
verbose : int or None, default=None
Verbosity level (clamped to [-2, 2]). Level >= 1 enables logging.
schema_version : int, default=0
Pickle schema version marker (does not affect on-disk format).
dtype : str, default="float32"
Data type for embeddings. Supported values:
* "float16" / "half" / "fp16" → float16_t (16-bit half precision)
* "float32" / "single" / "fp32" → float (32-bit single precision, default)
* "float64" / "double" / "fp64" → double (64-bit double precision)
* "float128" / "quad" / "fp128" → float128_t (128-bit or long double)
All types are accessed via the double-precision widened bridge.
float16 values are narrowed on add_item; float128 gains no input precision
but benefits from higher-precision internal arithmetic on GCC/Clang.
index_dtype : str, default="int32"
Index identifier type. Supported values:
* "int8" → int8_t (max 127 items)
* "uint8" → uint8_t (max 255 items)
* "int16" → int16_t (max 32,767 items)
* "uint16" → uint16_t (max 65,535 items)
* "int32" → int32_t (max 2,147,483,647 items, default)
* "uint32" → uint32_t (max 4,294,967,295 items)
* "int64" → int64_t (max 9,223,372,036,854,775,807 items)
* "uint64" → uint64_t (max 18,446,744,073,709,551,615 items)
wrapper_dtype : str, default="uint64"
Internal wrapper type (e.g., for Hamming packing).
Future: "bool", "uint8", "uint32" etc.
random_dtype : str, default="uint64"
Random seed type. Currently only "uint64" supported.
n_jobs : int or None, default=None
Number of threads. If -1, uses all available cores.
**kwargs
Future extensibility
Attributes
----------
f : int
Embedding dimension (0 means "unset / lazy").
metric : str or None
Canonical metric name, or None if not configured.
ptr : AnnoyIndexInterface*
Pointer to C++ index (NULL if not constructed).
# State Indicators (Internal)
_f_valid : bool
True if f has been set (> 0)
_metric_valid : bool
True if metric has been configured
_index_constructed : bool
True if C++ index exists (ptr != NULL)
Notes
-----
* 32-bit integer (4 bytes) can store values from −2**31 to 2**31−1, roughly ±2 billion.
* 64-bit integer (8 bytes) can store values from −2**63 to 2**63−1, roughly ±9 quintillion.
Examples
--------
>>> index = Index(f=128, metric='angular', seed=42)
>>> index.add_item(0, [0.1] * 128)
>>> index.add_item(1, [0.2] * 128)
>>> index.build(n_trees=10)
>>> neighbors, distances = index.get_nns_by_item(0, n=5, include_distances=True)
set dtype:
>>> # Standard usage (float32)
>>> index = Index(f=128, metric='angular', dtype='float32')
>>>
>>> # High precision (float64)
>>> index = Index(f=128, metric='euclidean', dtype='float64')
>>>
>>> # Half precision (float16) - future
>>> # index = Index(f=128, metric='angular', dtype='float16')
index = Index()
index
index.set_params(**index.get_params())
a = index.clone()
a
Index(10, metric= '.')
Index(10, metric= 'l1')
Index(10, metric= 'l2')
Index(10, metric= 'hamming')
import numpy as np
# Create index
index = Index(128)
# Add normalized vectors
for i in range(1000):
v = np.random.randn(128)
v = v / np.linalg.norm(v) # Normalize
index.add_item(i, v)
# Build and query
index.build(10)
neighbors, distances = index.get_nns_by_item(0, 10, include_distances=True)
neighbors, distances
/home/circleci/.pyenv/versions/3.11.15/lib/python3.11/site-packages/sphinx_gallery/gen_rst.py:801: FutureWarning: The default metric will be removed in a future version. Please pass metric='angular' explicitly.
exec(self.code, self.fake_main.__dict__)
([0, 266, 337, 364, 460, 877, 793, 162, 483, 786], [0.0, 1.2318352460861206, 1.2603861093521118, 1.2605830430984497, 1.2671748399734497, 1.2737023830413818, 1.281950831413269, 1.2857682704925537, 1.2968772649765015, 1.3004029989242554])
index.get_params()
{'f': 128, 'metric': 'angular', 'n_neighbors': 5, 'seed': None, 'verbose': None, 'on_disk_path': None, 'prefault': False, 'schema_version': 0, 'dtype': 'float32', 'index_dtype': 'int32', 'wrapper_dtype': 'uint64', 'random_dtype': 'uint64', 'n_jobs': 1}
with index.clone() as idx:
pprint(idx.get_state(), sort_dicts=False)
{'__version__': '1.0',
'params': {'f': 128,
'metric': 'angular',
'n_neighbors': 5,
'seed': None,
'verbose': None,
'on_disk_path': None,
'prefault': False,
'schema_version': 0,
'dtype': 'float32',
'index_dtype': 'int32',
'wrapper_dtype': 'uint64',
'random_dtype': 'uint64',
'n_jobs': 1},
'constructed': True,
'n_items': 0,
'n_trees': 0,
'index_data': None}
Total running time of the script: (0 minutes 0.028 seconds)
Related examples