
transfer code from Russ repo, clean up and adapt

Remi Gau, 1 year ago
Parent
Commit dd9a723f54
6 changed files with 446 additions and 0 deletions
  1. .gitignore (+129, -0)
  2. PrepareData.py (+265, -0)
  3. README.md (+20, -0)
  4. requirements.txt (+6, -0)
  5. team_id.xlsx (+1, -0)
  6. utils.py (+25, -0)

+ 129 - 0
.gitignore

@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+*pyscript*

+ 265 - 0
PrepareData.py

@@ -0,0 +1,265 @@
+"""
+obtain the data from neurovault and reformat as needed
+for sharing
+"""
+
+import os
+import hashlib
+import argparse
+import glob
+import shutil
+import pandas
+import json
+
+from neurovault_collection_downloader import cli
+from utils import log_to_file
+
+# these are teams that are excluded
+# from map analyses:
+# used surface-based analysis ('1K0E', 'X1Z4')
+# badly registered ( 'L1A8')
+# used SVC analysis which was not allowed ('VG39')
+# note that team 27TP is missing thresh files, so there
+# should be a total of 5 with missing thresh
+TEAMS_TO_SKIP = ["1K0E", "X1Z4", "L1A8", "VG39"]
+
+# incorrect unthresh values (very small) (5G9K)
+# did not report t/z stats (16IN)
+
+TEAMS_TO_REMOVE_UNTHRESH = ["5G9K", "16IN"]
+
+
+def get_download_dir(basedir, overwrite=False):
+    download_dir = os.path.join(basedir, "neurovault_downloads")
+    if overwrite and os.path.exists(download_dir):
+        print("removing existing downloads directory")
+        shutil.rmtree(download_dir)
+
+    if not os.path.exists(download_dir):
+        os.mkdir(download_dir)
+    return download_dir
+
+
+def fix_trailing_slashes(s):
+    """remove abritrary number of trailing slashes"""
+    s = s.strip()  # first remove spaces
+    while s[-1] == "/":
+        s = s.strip("/")
+    return s
+
+
+def get_collection_ids(infile="team_id.xlsx", verbose=True):
+    teaminfo = pandas.read_excel(infile)
+    collectionID = {}
+    for t in teaminfo.index:
+        teamID = teaminfo.loc[t, "teamID"]
+        if teamID in TEAMS_TO_SKIP:
+            if verbose:
+                print("skipping", teamID)
+            continue
+
+        public_link = teaminfo.loc[t, "NV_collection_link"]
+        public_link = fix_trailing_slashes(public_link)
+
+        collectionID[teamID] = os.path.basename(public_link)
+        if verbose:
+            print(teamID, collectionID[teamID], public_link)
+        assert len(collectionID[teamID]) > 3
+
+    return collectionID
+
+
+def download_collections(collectionIDs, download_dir, verbose=True, overwrite=False):
+    teamIDs = sorted(collectionIDs.keys())
+    failed_downloads = {}
+    for teamID in teamIDs:
+        if teamID in TEAMS_TO_SKIP:
+            if verbose:
+                print("skipping", teamID)
+            continue
+        try:
+            if overwrite or not os.path.exists(os.path.join(download_dir, teamID)):
+                if verbose:
+                    print("fetching", teamID)
+                cli.fetch_collection(collectionIDs[teamID], download_dir, f"{collectionIDs[teamID]}_{teamID}", exclude_tag=["sub", "cope"])
+
+
+            elif verbose:
+                print("using existing data for", teamID)
+        except Exception as e:
+            print("download failed for", teamID)
+            print(e)
+            failed_downloads[teamID] = e
+    return failed_downloads
+
+
+def check_downloads(completed_downloads):
+    """
+    check for complete downloads
+    """
+
+    missing_files = {}
+    for datadir in completed_downloads:
+        teamID = os.path.basename(datadir)
+        files = glob.glob(os.path.join(datadir, "*"))
+        print("%s: found %d files" % (teamID, len(files)))
+        # check for necessary files:
+        filenames = [os.path.basename(f).lower() for f in files]
+        missing_files[teamID] = []
+
+        for hyp in range(1, 10):
+            for imgtype in ["thresh", "unthresh"]:
+                targfile = "hypo%d_%s.nii.gz" % (hyp, imgtype)
+                if targfile not in filenames:
+                    missing_files[teamID].append(targfile)
+        if len(missing_files[teamID]) > 0:
+            print("missing %d files" % len(missing_files[teamID]))
+            print(filenames)
+    return missing_files
+
+
+def log_data(download_dir, logfile, verbose=True):
+    """record manifest and file hashes"""
+    imgfiles = {}
+    # traverse root directory, and list directories as dirs and files as files
+    for root, _, files in os.walk(download_dir):
+        path = root.split(os.sep)
+        for file in files:
+            if file.find(".nii.gz") < 0:
+                # skip non-nifti files
+                continue
+            fname = os.path.join(root, file)
+            filehash = hashlib.md5(open(fname, "rb").read()).hexdigest()
+            short_fname = os.path.join("/".join(path[-2:]), file)
+            imgfiles[short_fname] = filehash
+            if verbose:
+                print(short_fname, filehash)
+            log_to_file(logfile, f"{short_fname} {filehash}")
+
+
+def copy_renamed_files(collectionIDs, download_dir, logfile):
+    """change file names based on info in images.json"""
+    # setup target directory
+    orig_dir = os.path.join(os.path.dirname(download_dir), "orig")
+    if not os.path.exists(orig_dir):
+        os.mkdir(orig_dir)
+
+    for teamID in collectionIDs:
+        collectionID = f"{collectionIDs[teamID]}_{teamID}"
+        collection_dir = os.path.join(download_dir, collectionID)
+        fixed_dir = os.path.join(orig_dir, collectionID)
+        if not os.path.exists(fixed_dir):
+            os.mkdir(fixed_dir)
+
+        jsonfile = os.path.join(collection_dir, "images.json")
+        if not os.path.exists(jsonfile):
+            print("no json file for ", collectionID)
+            continue
+        with open(jsonfile) as f:
+            image_info = json.load(f)
+        for img in image_info:
+            origname = os.path.basename(img["file"])
+            # fix various issues with names
+            newname = (
+                img["name"].replace("tresh", "thresh").replace(" ", "_") + ".nii.gz"
+            )
+            newname = (
+                newname.replace("hypo_", "hypo")
+                .replace("uthresh", "unthresh")
+                .replace("_LR", "")
+            )
+
+            # skip unthresh images if necessary
+            if newname.find("unthresh") > -1 and teamID in TEAMS_TO_REMOVE_UNTHRESH:
+                continue
+
+            if origname.find("sub") > -1 or newname.find("thresh") <= -1:
+                continue
+            log_to_file(logfile, f"copying {collectionID}/{origname} to {collectionID}/{newname}")
+
+            shutil.copy(
+                os.path.join(collection_dir, origname),
+                os.path.join(fixed_dir, newname),
+            )
+    return orig_dir
+
+
+if __name__ == "__main__":
+    # parse arguments
+    parser = argparse.ArgumentParser(description="Process NARPS data")
+    parser.add_argument("-b", "--basedir", help="base directory")
+    parser.add_argument(
+        "-t", "--test", action="store_true", help="use testing mode (no processing)"
+    )
+    parser.add_argument(
+        "-l", "--leave_downloads", action="store_true", help="do not delete downloads"
+    )
+    parser.add_argument(
+        "-s", "--skip_download", action="store_true", help="use existing data"
+    )
+    args = parser.parse_args()
+
+    # set up base directory
+    if args.basedir is not None:
+        basedir = args.basedir
+    elif "NARPS_BASEDIR" in os.environ:
+        basedir = os.environ["NARPS_BASEDIR"]
+        print("using basedir specified in NARPS_BASEDIR")
+    else:
+        basedir = "/data"
+        print("using default basedir:", basedir)
+
+    if not os.path.exists(basedir):
+        os.mkdir(basedir)
+
+    # set up logging
+    logfile = os.path.join(basedir, "logs/neurovault_download.log")
+    if not os.path.exists(os.path.dirname(logfile)):
+        os.mkdir(os.path.dirname(logfile))
+
+    log_to_file(logfile, "Getting data from neurovault", flush=True)
+
+    collectionIDs = get_collection_ids()
+    print("found", len(collectionIDs), "collections")
+
+    if args.skip_download:
+        download_dir = get_download_dir(basedir, overwrite=False)
+        assert os.path.exists(download_dir)
+    else:
+        download_dir = get_download_dir(basedir)
+        print("downloading data to", basedir)
+
+        failed_downloads = download_collections(collectionIDs, download_dir)
+
+        if len(failed_downloads) > 0:
+            print("failed downloads for %d teams" % len(failed_downloads))
+            print(failed_downloads)
+            for f in failed_downloads:
+                log_to_file(logfile, f"{f}: {failed_downloads[f]}")
+
+    renaming_logfile = os.path.join(basedir, "logs/neurovault_renaming.log")
+    orig_dir = copy_renamed_files(collectionIDs, download_dir, renaming_logfile)
+
+    completed_downloads = [
+        i for i in glob.glob(os.path.join(orig_dir, "*")) if os.path.isdir(i)
+    ]
+    print("found %d completed downloads" % len(completed_downloads))
+
+    missing_files = check_downloads(completed_downloads)
+
+    has_missing_files = [
+        teamID for teamID in missing_files if len(missing_files[teamID]) > 0
+    ]
+    log_to_file(
+        logfile, "found %d teams with missing/misnamed files:" % len(has_missing_files)
+    )
+    log_to_file(logfile, " ".join(has_missing_files))
+
+    if not os.path.exists(os.path.join(basedir, "logs")):
+        os.mkdir(os.path.join(basedir, "logs"))
+
+    manifest_file = os.path.join(basedir, "logs/MANIFEST.neurovault")
+    log_data(download_dir, manifest_file)
+
+    if not args.leave_downloads:
+        shutil.rmtree(download_dir)

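For reference, the renaming step in `copy_renamed_files` above is a pure string transformation over the `name` field of each entry in `images.json`. The sketch below isolates that logic so it can be checked on its own; the input names are hypothetical examples of the kinds of labels teams used on NeuroVault, not actual entries from any collection.

```python
# Minimal sketch of the name normalization used in copy_renamed_files.
# The example names below are hypothetical, not taken from real collections.


def normalize_name(name):
    """Reproduce the string fixes applied when copying files to the orig/ directory."""
    newname = name.replace("tresh", "thresh").replace(" ", "_") + ".nii.gz"
    return (
        newname.replace("hypo_", "hypo")
        .replace("uthresh", "unthresh")
        .replace("_LR", "")
    )


if __name__ == "__main__":
    examples = {
        "hypo1 tresh": "hypo1_thresh.nii.gz",
        "hypo_5 unthresh": "hypo5_unthresh.nii.gz",
        "hypo2_uthresh_LR": "hypo2_unthresh.nii.gz",
    }
    for raw, expected in examples.items():
        assert normalize_name(raw) == expected, (raw, normalize_name(raw))
    print("all example names normalized as expected")
```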
+ 20 - 0
README.md

@@ -0,0 +1,20 @@
+# Neurovault data
+
+Taken from https://github.com/poldrack/narps/tree/master/ImageAnalyses
+
+The data provided for download were obtained from
+[Neurovault](http://neurovault.org) using `code/PrepareData.py`. The tarball includes
+files describing the provenance of the downloaded data (including MD5 hashes for
+identity checking).
+
+The necessary packages are listed in `requirements.txt`.
+
+Note that it is also possible to grab all the data from the Zenodo archive that was
+generated when the NARPS paper was released: https://zenodo.org/record/3528329/
+
+The `team_id.xlsx` file is required for the script to run; it lists all the different
+teams and the links to their NeuroVault collections.
+
+```bash
+python PrepareData.py -b $PWD
+```

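The README mentions that the tarball ships MD5 hashes for identity checking; `PrepareData.py` writes them to `logs/MANIFEST.neurovault`, one `<relative path> <md5>` pair per NIfTI file. Below is a minimal verification sketch, assuming the script was run with `--leave_downloads` so that the hashed files still exist under the base directory.

```python
"""Minimal sketch: check files against logs/MANIFEST.neurovault.

Assumes PrepareData.py was run with --leave_downloads so the
neurovault_downloads/ directory is still present under the base directory,
and that each manifest line is "<relative path> <md5>" as written by log_data.
"""

import hashlib
import os
import sys

basedir = sys.argv[1] if len(sys.argv) > 1 else "."
manifest = os.path.join(basedir, "logs", "MANIFEST.neurovault")

mismatches = []
with open(manifest) as f:
    for line in f:
        if not line.strip():
            continue
        relpath, expected_md5 = line.strip().rsplit(" ", 1)
        with open(os.path.join(basedir, relpath), "rb") as img:
            observed_md5 = hashlib.md5(img.read()).hexdigest()
        if observed_md5 != expected_md5:
            mismatches.append(relpath)

print(f"{len(mismatches)} files with mismatching hashes")
```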
+ 6 - 0
requirements.txt

@@ -0,0 +1,6 @@
+pandas
+requests
+openpyxl
+
+# Russ has a few extra commits compared to the upstream repo
+git+https://github.com/poldrack/neurovault_collection_downloader.git@f733fd88460230fc39b815574050082f319d7cb7

+ 1 - 0
team_id.xlsx

@@ -0,0 +1 @@
+.git/annex/objects/4Q/J8/MD5E-s15993--62d20e1f5b3310729f250a95e646291e.xlsx/MD5E-s15993--62d20e1f5b3310729f250a95e646291e.xlsx

+ 25 - 0
utils.py

@@ -0,0 +1,25 @@
+"""
+utility functions for narps analysis
+"""
+
+import os
+from datetime import datetime
+
+def log_to_file(
+    fname, s, flush=False, add_timestamp=True, also_print=True, headspace=0
+):
+    """save string to log file"""
+    if flush and os.path.exists(fname):
+        os.remove(fname)
+    if not isinstance(s, str):
+        s = str(s)
+    # add spacing before line
+    if headspace > 0:
+        s = os.linesep * headspace + s
+    with open(fname, "a+") as f:
+        if also_print:
+            print(s)
+        f.write(s + os.linesep)
+        if flush and add_timestamp:
+            f.write(datetime.isoformat(datetime.now()) + 2 * os.linesep)
+
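A short usage sketch for `log_to_file` (the log path here is arbitrary): `flush=True` removes any existing log and, together with the default `add_timestamp=True`, appends an ISO timestamp after the message; later calls simply append, optionally preceded by blank lines via `headspace`.

```python
# Hedged usage sketch for log_to_file; the log path is arbitrary.
from utils import log_to_file

logfile = "/tmp/example.log"

# start a fresh log: remove any existing file, write the message, add a timestamp
log_to_file(logfile, "Starting example run", flush=True)

# subsequent calls append to the same file
log_to_file(logfile, "step 1 done")

# headspace inserts blank lines before the message
log_to_file(logfile, "step 2 done", headspace=2)
```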