Source code for hotelling.helpers
"""helpers.py."""
from io import BytesIO
import os
from warnings import warn
import pandas as pd
try:
import dask.dataframe as dd
except ImportError:
dd = False
try:
import sixel
except ImportError:
sixel = None
def savefig(plt):
    """savefig.

    Allows displaying a matplotlib figure to the console terminal. This requires `pysixel` to be pip installed.
    It also requires a terminal with `Sixel graphic` support, like `DEC` with graphic support, Linux `xterm` (started
    with -ti 340), MLTerm (multilingual terminal, available on Windows, Linux etc).
    This is called by the command line tool when using --output stdout and can also be used in an ipython session.

    If the `sixel` module could not be imported, a warning is issued and nothing is drawn.

    :param plt: matplotlib pyplot
    :return: None
    """
    if sixel is None:
        # Without pysixel there is nothing to draw with: warn and stop here
        # instead of failing with AttributeError on `None.SixelWriter`.
        warn("No sixel module available. Please install pysixel")
        return

    # Render the figure into an in-memory buffer rather than a file on disk.
    buf = BytesIO()
    plt.savefig(buf)
    buf.seek(0)  # rewind so the writer reads the image from its first byte

    writer = sixel.SixelWriter()
    writer.draw(buf)
[docs]def load_df(filepath, server=None, dask=None, **kwargs):
"""load_df.
:param str filepath:
:param str server: head node for distributed cluster, ip address and port or hostname and port (localhost for local)
:param bool dask: if True, forces the use of dask,, even on smaller datasets
:param kwargs: to pass arguments to pandas `read_csv`
:return: dataframe
"""
try:
statinfo = os.stat(filepath) # file could be on hdfs or s3
filesize = statinfo.st_size
if filesize > 2 * 1024 ** 3: # 2GB, consider large
large = True
else:
large = dask
except (OSError, FileNotFoundError):
# doesn't exist, or is distributed.
filesize = 0
large = True
if server:
large = True # force it when server is specified
set_index = None
if dd and large: # dask is available
data_frame = dd
if "index_col" in kwargs.keys():
# for dask, we set index a different way, not in read_csv itself
index_col = kwargs.pop("index_col")
set_index = index_col
if server:
from distributed import Client
if server == "localhost":
client = Client() # "distributed", local
else:
client = Client(server) # distributed, head node
print(client.ncores())
else:
data_frame = pd
if set_index:
df = data_frame.read_csv(filepath, **kwargs).set_index(index_col, sorted=True, drop=True)
else:
df = data_frame.read_csv(filepath, **kwargs)
return df