Source code for embfile.formats.vvm

__all__ = ['VVMEmbFile', 'VVMEmbFileReader']

import io
import json
import os
import shutil
import tarfile
import tempfile
import warnings
from collections import OrderedDict
from pathlib import Path
from typing import IO, Iterable, Optional, Tuple

import numpy
from overrides import overrides

from embfile._utils import MappingComposition, coalesce, maybe_progbar, noop, progbar
from embfile.compression import open_file
from embfile.core._file import (DEFAULT_VERBOSE, EmbFile, check_vector_size, glance_first_element,
                                warn_if_wrong_vocab_size)
from embfile.core.loaders import RandomAccessLoader, Word2Vector
from embfile.core.reader import AbstractEmbFileReader
from embfile.types import DType, PairsType, PathType, VectorType

VOCAB_FILENAME = 'vocab.txt'
VECTORS_FILENAME = 'vectors.bin'
META_FILENAME = 'meta.json'

_TAR_COMPRESSIONS = {'gz', 'bz2', 'xz'}
DEFAULT_EXTENSION = '.vvm'


#: Default text encoding
DEFAULT_ENCODING = 'utf-8'

#: Default vector data type (little-endian single-precision floating point numbers)
DEFAULT_DTYPE = numpy.dtype('<f4')


[docs]class VVMEmbFileReader(AbstractEmbFileReader): """ :class:`~embfile.core.EmbFileReader` for the vvm format. """ def __init__(self, file, vectors_file) -> None: super().__init__(file.out_dtype) self.file = file self.vector_size = file.vector_size self.dtype = file.dtype self._vectors_file = vectors_file self._vector_size_in_bytes = file.vector_size * file.dtype.itemsize self._words_iter = iter(file.vocab) def _close(self) -> None: self._vectors_file.close() def _reset(self) -> None: self._words_iter = iter(self.file.vocab) self._vectors_file.seek(0) def _read_word(self) -> str: return next(self._words_iter) def _read_vector(self) -> VectorType: vec_bytes = self._vectors_file.read(self._vector_size_in_bytes) vector = numpy.frombuffer(vec_bytes, dtype=self.dtype) return numpy.asarray(vector, dtype=self.out_dtype) def _skip_vector(self) -> None: self._vectors_file.seek(self._vector_size_in_bytes, io.SEEK_CUR)
class _VectorsFileWrapper: """ Wraps vectors.bin file and allows to read each vector by index """ def __init__(self, binary_file, dtype: DType, out_dtype: DType, vector_size: int): self.file = binary_file self.dtype = numpy.dtype(dtype) self.out_dtype = numpy.dtype(out_dtype or dtype) self._vector_size_in_bytes = vector_size * self.dtype.itemsize def __getitem__(self, index) -> VectorType: """ Returns a vector by its index in the file (random access). """ file = self.file file.seek(index * self._vector_size_in_bytes, io.SEEK_SET) vec = numpy.frombuffer(file.read(self._vector_size_in_bytes), dtype=self.dtype) return vec.astype(self.out_dtype) def close(self): self.file.close()
[docs]class VVMEmbFile(EmbFile, Word2Vector): """ (Custom format) A tar file storing vocabulary, vectors and metadata in 3 separate files. Features: #. the vocabulary can be loaded very quickly (with no need for an external vocab file) and it is loaded in memory when the file is opened; #. direct access to vectors - by word using :meth:`__getitem__` (e.g. ``file['hello']``) - by index using :meth:`vector_at` #. implements :meth:`__contains__` (e.g. ``'hello' in file``) #. all the information needed to open the file are stored in the file itself **Specifics.** The files contained in a VVM file are: - *vocab.txt*: contains each word on a separate line - *vectors.bin*: contains the vectors in binary format (concatenated) - *meta.json*: must contain (at least) the following fields: - *vocab_size*: number of word vectors in the file - *vector_size*: length of a word vector - *encoding*: text encoding used for vocab.txt - *dtype*: vector data type string (notation used by numpy) Attributes: path encoding dtype out_dtype verbose vocab (OrderedDict[str, int]) map each word to its index in the file """ DEFAULT_EXTENSION = '.vvm' def __init__(self, path: PathType, out_dtype: Optional[DType] = None, verbose: int = DEFAULT_VERBOSE): """ Args: path: out_dtype: verbose: """ super().__init__(path, out_dtype, verbose=verbose) if not tarfile.is_tarfile(path): raise ValueError('not a valid vvm file: %s' % path) self._tar = tar = tarfile.open(path) def _open_tar_member(filename) -> IO[bytes]: member = tar.extractfile(filename) if member is None: raise ValueError('missing file inside the archive: ' + filename) return member # Read metadata with _open_tar_member(META_FILENAME) as meta_file: metadata = json.load(meta_file) self.metadata = metadata self.vocab_size = metadata['vocab_size'] # type: int self.vector_size = metadata['vector_size'] self.encoding = metadata['encoding'] self.dtype = metadata['dtype'] = numpy.dtype(metadata['dtype']) if self.out_dtype is None: # type: ignore self.out_dtype = self.dtype # Extract vectors.bin self._vectors_wrapper = self._get_vectors_file_wrapper() self._vector_size_in_bytes = self.dtype.itemsize * self.vector_size # Load the vocabulary with _open_tar_member(VOCAB_FILENAME) as vocab_file: vocab_reader = io.TextIOWrapper(vocab_file, encoding=self.encoding) lines_iterable = maybe_progbar(vocab_reader, yes=verbose, total=self.vocab_size, desc='Loading the vocabulary') self.vocab = OrderedDict((line[:-1], index) for index, line in enumerate(lines_iterable)) def _get_vectors_file_wrapper(self) -> '_VectorsFileWrapper': vectors_file = self._tar.extractfile(VECTORS_FILENAME) return _VectorsFileWrapper(vectors_file, self.dtype, self.out_dtype, self.vector_size) @overrides def _close(self) -> None: self._vectors_wrapper.close() self._tar.close() @overrides def _reader(self) -> VVMEmbFileReader: vectors_file = self._tar.extractfile(VECTORS_FILENAME) return VVMEmbFileReader(self, vectors_file) @overrides def _loader( self, words: Iterable[str], missing_ok=True, verbose: Optional[bool] = None ) -> RandomAccessLoader: word2index = self.vocab index2vec = self._get_vectors_file_wrapper() word2vec = MappingComposition(word2index, index2vec) return RandomAccessLoader(words, word2vec=word2vec, word2index=self.vocab.__getitem__, missing_ok=missing_ok, verbose=coalesce(verbose, self.verbose), close_hook=lambda: index2vec.close())
[docs] @overrides def words(self) -> Iterable[str]: return self.vocab.keys()
[docs] def __contains__(self, word: str) -> bool: """ Returns True if the file contains a vector for ``word`` """ return word in self.vocab
[docs] def vector_at(self, index: int) -> VectorType: """ Returns a vector by its index in the file (random access). """ if index >= self.vocab_size or index < -self.vocab_size: raise IndexError(index) if index < 0: index += self.vocab_size return self._vectors_wrapper[index]
[docs] def __getitem__(self, word) -> VectorType: """ Returns the vector associated to a word (random access to file). """ index = self.vocab[word] return self._vectors_wrapper[index]
@classmethod def _create(cls, out_path: Path, word_vectors: Iterable[Tuple[str, VectorType]], vector_size: int, vocab_size: Optional[int], compression: Optional[str] = None, verbose: bool = True, encoding: str = DEFAULT_ENCODING, dtype: Optional[DType] = None) -> Path: echo = print if verbose else noop if not dtype: (_, vector), word_vectors = glance_first_element(word_vectors) dtype = vector.dtype else: dtype = numpy.dtype(dtype) # Write everything in a temporary directory and then pack them into a tar file tempdir = Path(tempfile.mkdtemp()) vocab_tmp_path = tempdir / VOCAB_FILENAME vectors_tmp_path = tempdir / VECTORS_FILENAME meta_tmp_path = tempdir / META_FILENAME with open(vocab_tmp_path, 'wt', encoding=encoding) as vocab_file, \ open(vectors_tmp_path, 'wb') as vectors_file: # noqa desc = 'Generating {} and {} file'.format(VOCAB_FILENAME, VECTORS_FILENAME) i = -1 for i, (word, vector) in progbar(enumerate(word_vectors), verbose, desc=desc, total=vocab_size): if '\n' in word: raise ValueError("the word number %d contains one or more newline characters: " "%r" % (i, word)) vocab_file.write(word) vocab_file.write('\n') check_vector_size(i, vector, vector_size) vectors_file.write(numpy.asarray(vector, dtype).tobytes()) actual_vocab_size = i + 1 warn_if_wrong_vocab_size(vocab_size, actual_vocab_size, extra_info='the actual size will be written in meta.json') vocab_size = actual_vocab_size echo('Writing {}...'.format(META_FILENAME)) metadata = { "vocab_size": vocab_size, "vector_size": vector_size, "dtype": dtype.str, "encoding": encoding } with open(meta_tmp_path, 'w') as meta_file: json.dump(metadata, meta_file, indent=2) if not compression: tar_path = out_path tar_mode = 'w' elif compression in _TAR_COMPRESSIONS: tar_path = out_path tar_mode = 'w:' + compression else: warnings.warn('A VVM file is just a TAR file; you should compress it using ' 'one the formats directly supported by tarfile ({}). ' 'Using another compression format will require me to create a ' 'temporary uncompressed TAR file first, doubling the required time!') tar_path = out_path.with_suffix(out_path.suffix + '.tmp') tar_mode = 'w' echo('Packing all the files together') with tarfile.open(tar_path, tar_mode) as tar_file: tar_file.add(str(vocab_tmp_path), VOCAB_FILENAME) tar_file.add(str(vectors_tmp_path), VECTORS_FILENAME) tar_file.add(str(meta_tmp_path), META_FILENAME) shutil.rmtree(tempdir) if compression and compression not in _TAR_COMPRESSIONS: echo("Compressing to %s file: %s" % (compression, out_path)) with open_file(out_path, 'wb', compression=compression) as compressed_file: with open(tar_path, 'rb') as non_compressed_file: shutil.copyfileobj(non_compressed_file, compressed_file) os.remove(tar_path) return out_path
[docs] @classmethod def create(cls, out_path: PathType, word_vectors: PairsType, vocab_size: Optional[int] = None, compression: Optional[str] = None, verbose: bool = True, overwrite: bool = False, encoding: str = DEFAULT_ENCODING, dtype: Optional[DType] = None) -> None: """ Format-specific arguments are encoding and dtype. Being VVM a tar file, you should use a compression supported by the tarfile package (avoid zip): gz, bz2 or xz. See :meth:`~embfile.core.file.EmbFile.create` for more doc. """ super().create(out_path, word_vectors, vocab_size, compression, verbose, overwrite, encoding=encoding, dtype=dtype)
[docs] @classmethod def create_from_file(cls, source_file: 'EmbFile', out_dir: Optional[PathType] = None, out_filename: Optional[str] = None, vocab_size: Optional[int] = None, compression: Optional[str] = None, verbose: bool = True, overwrite: bool = False, encoding: str = DEFAULT_ENCODING, dtype: Optional[DType] = None) -> Path: """ Format-specific arguments are encoding and dtype. Being VVM a tar file, you should use a compression supported by the tarfile package (avoid zip): gz, bz2 or xz. See :meth:`~embfile.core.file.EmbFile.create_from_file` for more doc. """ return super().create_from_file( source_file, out_dir, out_filename, vocab_size, compression, verbose, overwrite, encoding=encoding, dtype=dtype)