# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division
import itertools
from petl.compat import string_types, text_type
from petl.util.base import Table, iterpeek
from petl.io.numpy import construct_dtype
[docs]def frombcolz(source, expression=None, outcols=None, limit=None, skip=0):
"""Extract a table from a bcolz ctable, e.g.::
>>> import petl as etl
>>>
>>> def example_from_bcolz():
... import bcolz
... cols = [
... ['apples', 'oranges', 'pears'],
... [1, 3, 7],
... [2.5, 4.4, .1]
... ]
... names = ('foo', 'bar', 'baz')
... ctbl = bcolz.ctable(cols, names=names)
... return etl.frombcolz(ctbl)
>>>
>>> example_from_bcolz() # doctest: +SKIP
+-----------+-----+-----+
| foo | bar | baz |
+===========+=====+=====+
| 'apples' | 1 | 2.5 |
+-----------+-----+-----+
| 'oranges' | 3 | 4.4 |
+-----------+-----+-----+
| 'pears' | 7 | 0.1 |
+-----------+-----+-----+
If `expression` is provided it will be executed by bcolz and only
matching rows returned, e.g.::
>>> tbl2 = etl.frombcolz(ctbl, expression='bar > 1') # doctest: +SKIP
>>> tbl2 # doctest: +SKIP
+-----------+-----+-----+
| foo | bar | baz |
+===========+=====+=====+
| 'oranges' | 3 | 4.4 |
+-----------+-----+-----+
| 'pears' | 7 | 0.1 |
+-----------+-----+-----+
.. versionadded:: 1.1.0
"""
return BcolzView(source, expression=expression, outcols=outcols,
limit=limit, skip=skip)
class BcolzView(Table):
def __init__(self, source, expression=None, outcols=None, limit=None,
skip=0):
self.source = source
self.expression = expression
self.outcols = outcols
self.limit = limit
self.skip = skip
def __iter__(self):
# obtain ctable
if isinstance(self.source, string_types):
import bcolz
ctbl = bcolz.open(self.source, mode='r')
else:
# assume bcolz ctable
ctbl = self.source
# obtain header
if self.outcols is None:
header = tuple(ctbl.names)
else:
header = tuple(self.outcols)
assert all(h in ctbl.names for h in header), 'invalid outcols'
yield header
# obtain iterator
if self.expression is None:
it = ctbl.iter(outcols=self.outcols, skip=self.skip,
limit=self.limit)
else:
it = ctbl.where(self.expression, outcols=self.outcols, skip=self.skip,
limit=self.limit)
for row in it:
yield row
[docs]def tobcolz(table, dtype=None, sample=1000, **kwargs):
"""Load data into a bcolz ctable, e.g.::
>>> import petl as etl
>>>
>>> def example_to_bcolz():
... table = [('foo', 'bar', 'baz'),
... ('apples', 1, 2.5),
... ('oranges', 3, 4.4),
... ('pears', 7, .1)]
... return etl.tobcolz(table)
>>>
>>> ctbl = example_to_bcolz() # doctest: +SKIP
>>> ctbl # doctest: +SKIP
ctable((3,), [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
nbytes: 132; cbytes: 1023.98 KB; ratio: 0.00
cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
[('apples', 1, 2.5) ('oranges', 3, 4.4) ('pears', 7, 0.1)]
>>> ctbl.names # doctest: +SKIP
['foo', 'bar', 'baz']
>>> ctbl['foo'] # doctest: +SKIP
carray((3,), <U7)
nbytes := 84; cbytes := 511.98 KB; ratio: 0.00
cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
chunklen := 18724; chunksize: 524272; blocksize: 0
['apples' 'oranges' 'pears']
Other keyword arguments are passed through to the ctable constructor.
.. versionadded:: 1.1.0
"""
import bcolz
import numpy as np
it = iter(table)
peek, it = iterpeek(it, sample)
hdr = next(it)
# numpy is fussy about having tuples, need to make sure
it = (tuple(row) for row in it)
flds = list(map(text_type, hdr))
dtype = construct_dtype(flds, peek, dtype)
# create ctable
kwargs.setdefault('expectedlen', 1000000)
kwargs.setdefault('mode', 'w')
ctbl = bcolz.ctable(np.array([], dtype=dtype), **kwargs)
# fill chunk-wise
chunklen = sum(ctbl.cols[name].chunklen
for name in ctbl.names) // len(ctbl.names)
while True:
data = list(itertools.islice(it, chunklen))
data = np.array(data, dtype=dtype)
ctbl.append(data)
if len(data) < chunklen:
break
ctbl.flush()
return ctbl
[docs]def appendbcolz(table, obj, check_names=True):
"""Append data into a bcolz ctable. The `obj` argument can be either an
existing ctable or the name of a directory were an on-disk ctable is
stored.
.. versionadded:: 1.1.0
"""
import bcolz
import numpy as np
if isinstance(obj, string_types):
ctbl = bcolz.open(obj, mode='a')
else:
assert hasattr(obj, 'append') and hasattr(obj, 'names'), \
'expected rootdir or ctable, found %r' % obj
ctbl = obj
# setup
dtype = ctbl.dtype
it = iter(table)
hdr = next(it)
flds = list(map(text_type, hdr))
# check names match
if check_names:
assert tuple(flds) == tuple(ctbl.names), 'column names do not match'
# fill chunk-wise
chunklen = sum(ctbl.cols[name].chunklen
for name in ctbl.names) // len(ctbl.names)
while True:
data = list(itertools.islice(it, chunklen))
data = np.array(data, dtype=dtype)
ctbl.append(data)
if len(data) < chunklen:
break
ctbl.flush()
return ctbl