Source code for embfile.core.reader
import abc
import numpy
from overrides import overrides
from embfile.errors import IllegalOperation
from embfile.types import DType, VectorType
[docs]class EmbFileReader(abc.ABC):
"""
*(Abstract class)* Iterator that yields a word at each step and read the corresponding vector
only if the lazy property ``current_vector`` is accessed.
**Iteration model.** The iteration model is not the most obvious: each iteration step doesn't
return a word vector pair. Instead, for performance reasons, at each step a reader returns
the next word. To read the vector for the current word, you must access the (lazy) property
:meth:`current_vector`::
with emb_file.reader() as reader:
for word in reader:
if word in my_vocab:
word2vec[word] = reader.current_vector
When you access :meth:`~embfile.core.EmbFileCursor.current_vector` for the first time,
the vector data is read/parsed and a vector is created; the vector remains
accessible until a new word is read.
**Creation.** Creating a reader usually implies the creation of a file object. That's why
``EmbFileReader`` implements the ``ContextManager`` interface so that you can use it inside
a ``with`` clause. Nonetheless, a ``EmbFile`` keeps track of all its open readers and close them
automatically when it is closed.
Args:
out_dtype:
all the vectors will be converted to this dtype before being returned
Attributes:
out_dtype (numpy.dtype):
all the vectors will be converted to this data type before being returned
"""
def __init__(self, out_dtype: DType):
self.out_dtype = numpy.dtype(out_dtype)
self._closed = False
self._current_vector = None
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def __iter__(self):
return self
def __next__(self) -> str:
return self.next_word()
def _raise_illegal_operation(self, *args, **kwargs):
raise IllegalOperation('attempted to use a closed reader')
# noinspection PyAttributeOutsideInit
[docs] def close(self) -> None:
""" Closes the reader """
if self._closed:
return
self._close()
self._closed = True
self.reset = self.next_word = self._raise_illegal_operation # type: ignore
@abc.abstractmethod
def _close(self) -> None:
""" *(Abstract method)* Closes the reader """
[docs] @abc.abstractmethod
def reset(self) -> None:
""" *(Abstract method)* Brings back the reader to the first word vector pair """
[docs] @abc.abstractmethod
def next_word(self) -> str:
""" *(Abstract method)* Reads and returns the next word in the file. """
[docs] @abc.abstractmethod
def current_vector(self) -> VectorType:
""" *(Abstract method)* The vector for the current word (i.e. the last word read).
If accessed before any word has been read, it raises ``IllegalOperation``.
The dtype of the returned vector is cls.out_dtype. """
[docs]class AbstractEmbFileReader(EmbFileReader, abc.ABC):
"""
*(Abstract class)* Facilitates the implementation of a :class:`EmbFileReader`, especially for a
file that stores a word and its vector nearby in the file (txt and bin formats), though it can
be used for other kind of formats as well if it looks convenient. It:
- keeps track of whether the reader is pointing to a word or a vector and skips the vector
when it is not requested during an iteration
- caches the current vector once it is read
Sub-classes must implement:
.. autosummary::
_read_word
_read_vector
_skip_vector
_close
.. automethod:: _read_word
.. automethod:: _read_vector
.. automethod:: _skip_vector
.. automethod:: _reset
.. automethod:: _close
"""
def __init__(self, out_dtype: DType):
super().__init__(out_dtype)
self._closed = False
self._pointing_to_vector = False # True if the file reader is pointing to vector data
self._current_vector = None
[docs] @abc.abstractmethod
def _reset(self) -> None:
""" *(Abstract method)* Resets the reader """
[docs] @overrides
def reset(self) -> None: # type: ignore
""" Brings back the reader to the beginning of the file """
self._reset()
self._pointing_to_vector = False
self._current_vector = None
[docs] @abc.abstractmethod
def _read_word(self) -> str:
"""
*(Abstract method)* Reads a word assuming the next thing to read in the file is a word.
It must raise StopIteration if there's not another word to read.
"""
[docs] @abc.abstractmethod
def _read_vector(self) -> VectorType:
"""
*(Abstract method)* Reads the vector for the last word read. This method is never called if
no word has been read or at the end of file. It is called at most time per word.
"""
[docs] @abc.abstractmethod
def _skip_vector(self) -> None:
"""
*(Abstract method)* Called when we want to read the next word without loading
the vector for the current word. For some formats, it may be empty.
"""
[docs] @overrides
def next_word(self) -> str: # type: ignore
""" Reads and returns the next word in the file. """
if self._pointing_to_vector:
self._skip_vector()
self._pointing_to_vector = False
word = self._read_word() # this can raise StopIteration
self._pointing_to_vector = True
self._current_vector = None
return word
@property
def current_vector(self) -> VectorType:
"""
The vector associated to the current word (i.e. the last word read).
If accessed before any word has been read, it raises ``IllegalOperation``.
"""
if self._current_vector is None:
if not self._pointing_to_vector:
raise IllegalOperation('you called current_vector before reading any word')
self._current_vector = self._read_vector()
self._pointing_to_vector = False
return self._current_vector