Source code for jupyter_analysis_tools.datalocations
# -*- coding: utf-8 -*-
# datalocations.py
import glob
import os
import shutil
import tempfile
from pathlib import Path
from .utils import indent, isList
def getWorkDir(workDir=None, skip=False):
    """Find a local work dir for temporary files created during analysis.
    The default is *$HOME/data*."""
    if skip:  # stay in the current directory if desired
        return os.path.abspath(".")
    if not workDir or not len(workDir):
        workDir = Path.home() / "data"
    else:
        workDir = Path(workDir).resolve()
    if not workDir.is_dir():
        os.mkdir(workDir)
    print("Using '{}' as working directory.".format(workDir))
    return workDir
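# Usage sketch (not part of the original module; the paths below are hypothetical):
#     getWorkDir()                # -> Path("$HOME/data"), created if missing
#     getWorkDir("/tmp/scratch")  # -> resolved Path, created if missing
#     getWorkDir(skip=True)       # -> absolute path of the current directory (as str)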
def prepareWorkDir(workDir, srcDir, useExisting=False):
    """Create a temporary working directory and copy
    the input data (series) to it if not already present."""
    # source dir has to exist
    if not os.path.isdir(srcDir):
        raise RuntimeError("Provided source directory '{}' not found!".format(srcDir))
    srcDir = os.path.realpath(srcDir)
    # no separate work dir requested?
    if os.path.samefile(workDir, os.getcwd()):
        print("Working in current directory '{}'.".format(os.getcwd()))
        return srcDir  # nothing to do
    prefix = os.path.basename(srcDir) + "_"
    if useExisting:  # use an existing work dir, avoid copying
        dirs = glob.glob(os.path.join(workDir, prefix + "*"))
        if len(dirs):
            return dirs[0]  # use the first match
        print("No existing work dir found, creating a new one.")
    # copy all data from src dir to a newly created work dir
    workDir = tempfile.mkdtemp(dir=workDir, prefix=prefix)
    print("Copying data to {}:".format(workDir))
    for dn in os.listdir(srcDir):
        srcPath = os.path.join(srcDir, dn)
        dstPath = os.path.join(workDir, dn)
        if os.path.isdir(srcPath):
            shutil.copytree(srcPath, dstPath)
            print(indent, dn)
        if os.path.isfile(srcPath):
            shutil.copy(srcPath, dstPath)
            print(indent, dn)
    print("Done preparing work dir.")
    return workDir
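# Usage sketch (hypothetical source path): reuse an existing "series01_*" directory
# below the base work dir if present, otherwise create one and copy the data into it:
#     workDir = prepareWorkDir(getWorkDir(), "/data/series01", useExisting=True)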
def printFileList(fnlst, numParts=2, limit=20):
    """Print the given file paths, shortened to their last *numParts* path components.
    Lists longer than *limit* entries are abbreviated to their first and last three."""

    def printlst(lst):
        return [print(indent, fn) for fn in lst]

    def shorten(lst):
        return [os.path.join(*Path(fn).parts[-numParts:]) for fn in lst]

    if len(fnlst) > limit:
        printlst(shorten(fnlst[:3]))
        print(indent, "[...]")
        printlst(shorten(fnlst[-3:]))
    else:
        printlst(shorten(fnlst))
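# Usage sketch (hypothetical glob pattern): print a possibly long file list, abbreviated
# to its first and last three entries, each shortened to its last two path components:
#     printFileList(sorted(glob.glob("/data/series01/*/*.dat")), numParts=2, limit=20)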
def getDataDirs(dataDir, noWorkDir=False, reuseWorkDir=True, workDir=None):
    """Create a local work dir with a copy of the input data and for storing the results.
    (Data might reside in synced folders which creates massive traffic once batch processing
    results get replaced repeatedly.)

    Parameters
    ----------
    noWorkDir: bool
        False: Copy the input data to a new working dir (default),
        True: use the data where it is.
    reuseWorkDir: bool
        False: Create a new working dir each time,
        True: reuse the work dir if it exists already (default).

    Returns
    -------
    A list of absolute directory paths.
    """
    basedir = getWorkDir(workDir=workDir, skip=noWorkDir)
    workDir = prepareWorkDir(basedir, dataDir, useExisting=reuseWorkDir)
    print("Entering '{}':".format(workDir))
    dirs = sorted([dn for dn in Path(workDir).iterdir() if dn.is_dir()])
    dirs.append(Path(workDir))
    # [print(os.path.join(*dn.parts[-2:])) for dn in dirs]
    printFileList(dirs, numParts=1)
    return dirs
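# Usage sketch (hypothetical input directory):
#     dirs = getDataDirs("/data/series01")
#         # copy into "$HOME/data/series01_*", return its subdirectories plus the work dir itself
#     dirs = getDataDirs("/data/series01", noWorkDir=True)
#         # no copy, analyze the data where it is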
def getDataFiles(dataDirs, include=None, exclude=None):
    """Return absolute file paths from given directories."""

    def getFiles(dn, include=None):
        if not include:
            include = "*"
        if not isList(include):
            include = (include,)
        return [path for inc in include for path in glob.glob(os.path.join(dn, inc))]

    if not exclude:
        exclude = ()
    if not isList(exclude):
        exclude = (exclude,)
    if not isList(dataDirs):
        dataDirs = (dataDirs,)
    files = [
        fn
        for dn in dataDirs
        for fn in getFiles(dn, include)
        if not any((ex in fn) for ex in exclude)
    ]
    print("{} files to be analyzed in subdirectories.".format(len(files)))
    return sorted(files)
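# Usage sketch (hypothetical patterns), combined with getDataDirs() above:
#     files = getDataFiles(dirs, include=["*.dat", "*.csv"], exclude="_backup")
#         # keep files matching either pattern, drop any path containing "_backup"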