#! /usr/bin/env python
#
# Tools to operate on files
# dysh_data = simple frontend to grab common dysh filenames
# fdr = Recursive data (file) finder
#
# command line usage:
#
# fdr [-r] [-m MAXFILES] file [path1 path2 ...]
#
import glob
import os
from pathlib import Path
import dysh.util as util
from dysh.log import logger
from dysh.util.download import from_url
from ..util import minimum_string_match
# the GBTIDL examples from https://gbtdocs.readthedocs.io/en/latest/how-tos/data_reduction/gbtidl.html
# getps: "data/ngc2415.fits" NGC2415 example=getps2 (TGBT21A_501_11)
# getfs: "data/TGBT22A_503_02.raw.vegas" W3_1 test= example=
# getsigref: "data/TGBT22A_503_02.raw.vegas" W3_1
# getps: "data/AGBT17A_404_01.raw.vegas" A123606 test=
#
# @todo convert everything to use Path()
# Path() cannot be used on input.... input needs to be a string
# fmt:off
# $DYSH/testdata @ todo normalize names with the example= cases
# ~300 MB
valid_dysh_test = {
"getps" : "AGBT05B_047_01/AGBT05B_047_01.raw.acs/", # NGC5291 old test1
"getps2" : "TGBT21A_501_11/TGBT21A_501_11.raw.vegas.fits", # NGC2415 one int, 540k
"getfs" : "TGBT21A_504_01/TGBT21A_504_01.raw.vegas/TGBT21A_504_01.raw.vegas.A.fits", # W3OH
"subbeamnod" : "TRCO_230413_Ka",
}
# http://www.gb.nrao.edu/dysh/example_data or /home/dysh/example_data or $DYSH_DATA/example_data
# @todo see if we want the staff training datasets in here
# ~410 GB
valid_dysh_example = {
"getps" : "positionswitch/data/AGBT05B_047_01/AGBT05B_047_01.raw.acs/AGBT05B_047_01.raw.acs.fits", # NGC5291 old test1
# Used in a lot of example notebooks:
# example/dataIO
# example/metadata_management
# example/positionswitch
# example/smoothing
# example/velocity frames
"getps0" : "positionswitch/data/AGBT05B_047_01/AGBT05B_047_01.raw.acs", # NGC5291 old test1
"getps2" : "onoff-L/data/TGBT21A_501_11.raw.vegas.fits", # NGC2415 - old getps
"getpslarge" : "onoff-L/data/TGBT21A_501_11.raw.vegas/", # NGC2415, NGC2782 etc. - total 15GB
"getfs" : "fs-L/data/AGBT20B_014_03.raw.vegas/AGBT20B_014_03.raw.vegas.A.fits",
"getfs2" : "frequencyswitch/data/TREG_050627/TREG_050627.raw.acs/", # W3OH # staff training FS
"subbeamnod" : "subbeamnod/data/AGBT13A_124_06/AGBT13A_124_06.raw.acs/", # vIIzw31 example/subbeamnod staff training SBN -- no signal?
"subbeamnod2": "subbeamnod-Ka/data/TRCO_230413_Ka.raw.vegas/TRCO_230413_Ka.raw.vegas.A.fits",
"nod" : "nod-KFPA/data/TGBT22A_503_02.raw.vegas/", # W3_1 example/nodding (scan 62,63)
# TGBT22A_503_02.raw.vegas # FS example in data_reduction (scan 64)
"align" : "mixed-fs-ps/data/TGBT24B_613_04.raw.vegas.trim.fits", # MESSIER32 example/align_spectra
"flagging" : "rfi-L/data/AGBT17A_404_01.tar.gz", # tar.gz not yet supported? A123606 example/flagging
"survey" : "hi-survey/data/AGBT04A_008_02.raw.acs/AGBT04A_008_02.raw.acs.fits", # example/hi-survey
"otf1" : "mapping-L/data/TGBT17A_506_11.raw.vegas/", # OTF L-band NGC6946
"otf2" : "AGBT21B_024_01", # OTF Argus NGC0001 (EDGE)
"otf3" : "AGBT21B_024_20", # OTF Argus NGC5954 (EDGE)
"otf4" : "mapping-Argus/data/TGBT22A_603_05.raw.vegas/", # OTF Argus DR21
}
# /home/dysh/acceptance_testing or $DYSH_DATA/acceptance_testing
# in acceptance_testing/data
# AGBT05B_047_01 AGBT15B_244_07 AGBT18A_503_02 AGBT19A_473_41 TGBT18A_500_06
# AGBT13A_240_03 AGBT16B_392_01 AGBT18B_014_02 AGBT19B_096_08 TGBT21A_501_10
# AGBT14B_480_06 AGBT17B_004_14 AGBT18B_354_03 AGBT20B_336_01 TREG_050627
# AGBT1s5B_228_08 AGBT17B_319_06 AGBT19A_080_01 AGBT22A_325_15 TSCAL_19Nov2015
# ~ 33 GB
valid_dysh_accept = {
"nod1" : "AGBT22A_325_15/AGBT22A_325_15.raw.vegas",
"nod2" : "TREG_050627/TREG_050627.raw.acs/TREG_050627.raw.acs.fits", # deprecated? W3OH example/frequencyswitch
"nod3" : "AGBT15B_244_07/AGBT15B_244_07.raw.vegas", # M82 examples/calseq
"nod4" : "TGBT18A_500_06/TGBT18A_500_06.raw.vegas",
"nod5" : "TSCAL_19Nov2015/TSCAL_19Nov2015.raw.acs/TSCAL_19Nov2015.raw.acs.fits", # deprecated
"nod6" : "AGBT17B_319_06/AGBT17B_319_06.raw.vegas",
"nod7" : "TGBT21A_501_10/TGBT21A_501_10.raw.vegas",
"nod8" : "AGBT19A_340_07/AGBT19A_340_07.raw.vegas",
"nod9" : "AGBT12A_076_05/AGBT12A_076_05.raw.acs",
"multismallsmall" : "AGBT20B_336_01/AGBT20B_336_01.raw.vegas", # multiple small FITS files (54M each), small flags files (7 lines)
"multihugesmall" : "AGBT14B_480_06/AGBT14B_480_06.raw.vegas", # multiple huge FITS files (3.5G each), small flags files (6 lines)
"multismallbig" : "AGBT23A_432_03/AGBT23A_432_03.raw.vegas", # multiple small FITS files (64M each), large flag files (20 lines)
"multibighuge" : "AGBT17B_319_06/AGBT17B_319_06.raw.vegas", # multiple large FITS files (733M each), huge flag files (102 lines)
}
# fmt: on
[docs]
def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, gui=False):
r"""Resolves the filename within the dysh data system without the need
for an absolute path by passing mnemonics to any of four entry
points (`sdfits`, `!test`, `example`, `accept`).
Currently configured to work at GBO. For other sites users need to
configure a $DYSH_DATA directory, properly populated with
(symlinks to) directories as described below. Optionally, an
explicit `dysh_data` can be given, which overrides any possible
$DYSH_DATA environment (or configuration) that may exist.
Only one of the keywords `sdfits`, `!test`, `example`, `accept` can be
given to probe for data.
As an exception, if the first argument (`sdfits`) has an absolute
filename, it is passed unchecked.
gui mode is experimental and may disappear or re-implemented at a
later stage.
The locations of various dysh_data directory roots are presented in the following
Table, where $DYSH is the repo root for developers (this can be found using
`dysh.util.get_project_root`).
======== ============================= =============================
keyword location at GBO $DYSH_DATA root
======== ============================= =============================
sdfits= /home/sdfits $DYSH_DATA/sdfits
test= $DYSH/testdata $DYSH_DATA/testdata
example= /home/dysh/example_data $DYSH_DATA/example_data
accept= /home/dysh/acceptance_testing $DYSH_DATA/acceptance_testing
======== ============================= =============================
`!test` resolves to the same filename as the `util.get_project_testdata()` function
but it is otherwise only available for developers (the testdata directory is not available
if you `pip install dysh`).
If present, the $SDFITS_DATA directory is honored instead of the default for `sdfits`
and overrides the $DYSH_DATA directory.
Examples
--------
Using mnemonics
>>> fn = dysh_data(test='getps')
>>> fn = dysh_data(example='getfs')
Using full paths
>>> fn = dysh_data(example='onoff-L/data/TGBT21A_501_11.raw.vegas')
Using a project id
>>> fn = dysh_data('AGBT21B_024_54')
This will return `/home/sdfits/AGBT21B_024_54` at GBO, or `${DYSH_DATA}/sdfits/AGBT21B_024_54`
if the $DYSH_DATA environment variable is set.
Notes
-----
1) if $DYSH_DATA exist (and this is a new proposal), it will prepend
that to the argument of get_dysh_data() and check for existence
if $DYSH_DATA does not exist, but $SDFITS_DATA exists (a GBTIDL feature)
it will use that
2) if /home/dysh exists, it will prepend this and check for existence
this will keep GBO people happy. Offsite a symlink should also work.
3) if none of those gave a valid name, it will fall back to making a URL
by prepending http://www.gb.nrao.edu/dysh/ and using
from_url for as long we want to support that.
astropy caching is also an option
4) directories (names not ending on .fits) cannot be downloaded using from_url
5) configuration TBD
"""
# fmt:off
_url = "http://www.gb.nrao.edu/dysh/" # base of all things dysh
_example_data = "/home/dysh/public_html/example_data" # GBO direct access
_test_data = "/home/dysh/public_html/test_data" # not used ??
_accept_data = "/home/dysh/acceptance_testing/data" # not in public_html ??
# fmt:on
if type(dysh_data) is str:
dysh_data = Path(dysh_data)
def sdfits_offline(fn):
"""fn is an sdfits= file or directory that was shown to exist
If fn contains only one name
See also GBTOffline()
"""
if fn.is_file():
return fn
if not fn.is_dir():
print(f"{fn} is not a file nor a directory, dunno how to proceed")
return None
# find all fits files one level deep
ff = list(fn.glob("*/*.fits"))
if len(ff) == 0:
return fn
# ensure there is only a single parent
parents = []
for f in ff:
parents.append(f.parent)
parents = list(set(parents))
if len(parents) > 1:
print(f"{fn} does not contain a single fits tree: {parents}")
# @todo throw ? or return the first one?
return parents[0]
def use_gui(my_dir):
logger.debug(f"Using the GUI on {my_dir} is totally experimental.")
import tkinter as tk
from tkinter import filedialog
root = tk.Tk()
root.withdraw()
file_path = filedialog.askopenfilename(initialdir=my_dir)
# can currently only ask for files, use askdirectory() otherwise
return file_path
# 1. find out if there is a dysh_data (or use $DYSH_DATA, or a .dyshrc config?)
# - if present, API dysh_data is used
# - if present, $DYSH_DATA is used
# - if present, python_env is used
# - if all of this fails, assume we're at GBO (all via /home/dysh)
# - if that still fails, look at current working directory
# - throw!?
# ? e.g. dysh_data('foo.fits') -> sdfits='foo.fits'
if dysh_data == None and "DYSH_DATA" in os.environ: # noqa: E711
dysh_data = Path(os.environ["DYSH_DATA"])
logger.debug(f"DYSH_DATA: {dysh_data}")
# 2. Process whichever one of 'sdfits=', 'test=', 'example=', and 'accept=' is present (in that order)
# sdfits: the main place where GBO data reside
if sdfits is not None:
if sdfits == "!":
logger.warning("The GUI is experimental, it can only select a single fits file, no directories")
return use_gui(dysh_data)
if sdfits == "?" or sdfits == "*":
if "SDFITS_DATA" in os.environ:
dd = Path(os.environ["SDFITS_DATA"])
elif dysh_data == None: # noqa: E711
dd = Path("/home/sdfits")
else:
dd = Path(dysh_data) / "sdfits"
# @todo figure out listing of file OS agnostic
cmd = f"ls {dd}"
print("# dysh_data::sdfits")
print("# contents of", dd)
print("# -----------------")
os.system(cmd)
return None
if dysh_data is not None:
fn = dysh_data / Path("sdfits") / sdfits # normally user is using a private sdfits
if fn.exists():
return sdfits_offline(fn)
if "SDFITS_DATA" in os.environ:
fn = Path(os.environ["SDFITS_DATA"]) / sdfits
return sdfits_offline(fn)
fn = Path("/home/sdfits/") / sdfits # expected at GBO
if fn.exists():
return sdfits_offline(fn)
# print(f"could not handle sdfits={sdfits} yet")
return None
# test: this should also be allowed to use util.get_project_testdata() as well
if test is not None:
if test == "?":
print("# dysh_data::test")
print("# ---------------")
for k in valid_dysh_test.keys():
print(k, valid_dysh_test[k])
return None
my_test = minimum_string_match(test, list(valid_dysh_test.keys()))
if my_test is not None:
my_test = valid_dysh_test[my_test]
else:
my_test = test
if dysh_data is not None:
fn = dysh_data / "testdata" / my_test
if not fn.exists():
fn = util.get_project_testdata() / my_test
else:
fn = util.get_project_testdata() / my_test
logger.debug(f"final: {fn}")
if fn.exists(): # @todo this catches files and directories
return fn
print("Could not find", fn)
return None
# example: these can also obtain data via from_url (or perhaps astropy caching???)
if example is not None:
if example == "?":
print("# dysh_data::example")
print("# ------------------")
for k in valid_dysh_example.keys():
print(k, valid_dysh_example[k])
return None
my_example = minimum_string_match(example, list(valid_dysh_example.keys()))
if my_example is not None:
my_example = valid_dysh_example[my_example]
else:
my_example = example
if dysh_data is not None:
fn = dysh_data / "example_data" / my_example
if fn.exists():
return fn
print("Odd-1, did not find", fn)
if dysh_data is None and os.path.exists(_example_data):
fn = Path(_example_data) / my_example
if fn.exists():
return fn
print("Odd-2, did not find", fn)
# last resort, try getting it via from_url, but it will then be a local file in the current directory
url = _url + "/example_data/" + my_example
logger.info(f"url: {url}")
filename = url.split("/")[-1]
if not os.path.exists(filename):
print(f"Downloading {filename} from {url}")
try:
filename = from_url(url)
print(f"\nRetrieved {filename}")
except Exception as e:
print(f"\nFailing to retrieve example {filename} ")
print(e)
return None
else:
print(f"{filename} already downloaded")
return Path(filename)
# accept: acceptance_testing/data - from_url not recommended (does not work on multifile fits)
if accept is not None:
if accept == "?":
print("# dysh_data::accept")
print("# -----------------")
for k in valid_dysh_accept.keys():
print(k, valid_dysh_accept[k])
return None
my_accept = minimum_string_match(accept, list(valid_dysh_accept.keys()))
if my_accept is not None:
my_accept = valid_dysh_accept[my_accept]
else:
my_accept = accept
if dysh_data is not None:
fn = dysh_data / "acceptance_testing/data" / my_accept
if fn.exists():
return fn
print("Odd-1, did not find", fn)
if dysh_data is None and os.path.exists(_accept_data):
fn = Path(_accept_data) / my_accept
if fn.exists():
return fn
print("Odd-2, did not find", fn)
# last resort, try getting it via from_url, but it will then be a local file in the current directory
url = _url + "/acceptance_testing/data/" + my_accept
logger.debug(f"url: {url}")
filename = url.split("/")[-1]
if not os.path.exists(filename):
print(f"Downloading {filename} from {url}")
try:
filename = from_url(url)
print(f"\nRetrieved {filename}")
except Exception as e:
print(f"\nFailing to retrieve accept {filename}")
print(e)
return None
else:
print(f"{filename} already downloaded")
return Path(filename)
print("You have not given one of: sdfits=, test=, example=, accept=")
print("or use =? as argument to get a list of valid shortcuts")
print(f"DYSH_DATA = {dysh_data}")
return None
# def find_data_recursively(filename, path=None, recursive=False, wildcard=False, maxfiles=None):
[docs]
def fdr(filename, path=None, recursive=False, wildcard=False, maxfiles=None):
"""
Input:
filename - can be wildcard too (but see the wildcard option)
path - optional. can be : separated, can start with $ if envvar
recursive - recursively search: Default: False
wildcard - automatically wildcard the filename: Default not used
maxfiles - maximum number of files to be returns. Default: All
Returns:
list of found filenames, with maxfiles entries if applicable.
Note list could be empty.
Note if multiple paths are given, maxfiles is applied to each sublist
See also:
astropy's getdata ???
pdrptry.pdrutils.get_testdata()
astropy.utils.data.get_pkg_data_filenames
Examples:
fdr('ngc1234.fits') - this exact file!
fdr('*.fits') - all fits file in this directory
fdr('ngc1234.fits','/tmp') - this file in /tmp
fdr('*.fits','/tmp') - all fits files in /tmp
fdr('ngc1234.fits','$DYSH_DATA_PATH')
fdr('ngc1234.fits','$DYSH_DATA_PATH', True)
fdr('ngc1234.fits','$DYSH_DATA_PATH:/data/gbt')
"""
if os.path.exists(filename):
return [filename]
if path is None:
if wildcard:
fname = "*" + filename + "*"
else:
fname = filename
if recursive:
fname = "**/" + fname
logger.debug("# FNAME:", fname)
fn = glob.glob(fname, recursive=recursive)
if maxfiles is None:
retval = fn
else:
retval = fn[:maxfiles]
retval.sort()
else:
cwd0 = os.getcwd()
all = []
for p in path.split(":"):
if p[0] == "$":
if p[1:] in os.environ:
p = os.environ[p[1:]]
else:
print(f"# Warning: {p} not in the environment")
if os.path.exists(p):
os.chdir(p)
if wildcard:
fname = "*" + filename + "*"
else:
fname = filename
if recursive:
fname = "**/" + fname
fn = glob.glob(fname, recursive=recursive)
fn.sort()
if maxfiles is not None:
fn = fn[:maxfiles]
all = all + fn
else:
print(f"# Warning: directory {p} does not exist")
os.chdir(cwd0)
retval = all
return retval
[docs]
def main_cli():
import argparse
my_help = """
This script searches for files, optionally hierarchically,
much like the Unix 'find' program.
A difference is handling the --path directive, as multiple
colon separated paths can be given, much like the $PATH
environment variable in Unix.
The path variable can also expand $-environment variables.
"""
p = argparse.ArgumentParser(description=my_help, epilog="And so the search goes on....")
p.add_argument(
"-m",
"--maxfiles",
type=int,
default=None,
help="Maximum number of files to return [Default: all]",
)
p.add_argument("-c", "--count", action="store_true", help="add counter to filenames?")
p.add_argument("-w", "--wildcard", action="store_true", help="fully wildcard the filename embedded")
p.add_argument("-r", "--recursive", action="store_true", help="resursive?")
p.add_argument("-p", "--path", type=str, default=None, help="optional (colon separated) path(s)")
p.add_argument("filename", nargs="+", help="Filename(s) to search for")
args = p.parse_args()
logger.debug("#", args)
filename = args.filename
maxfiles = args.maxfiles
recursive = args.recursive
wildcard = args.wildcard
path = args.path
count = args.count
r = []
for f in filename:
r = r + fdr(f, path, recursive, wildcard, maxfiles)
# r.sort()
if count:
n = 1
for f in r:
print(n, f)
n = n + 1
else:
for f in r:
print(f)
if __name__ == "__main__":
main_cli()