Reference

Cohort

Source code in app\minds\__init__.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
class Cohort:
    """A set of cases tied to an output directory and its manifest file.

    The manifest is expected at ``<output_dir>/manifest.json``. ``download``
    and ``stats`` read it; ``generate_manifest`` delegates to ``Aggregator``,
    which presumably writes it (confirm against ``Aggregator``).
    """

    def __init__(self, data, output_dir):
        """Store the cohort data and derive the manifest path.

        Args:
            data: Cohort data, handed unchanged to ``Aggregator``.
            output_dir: Directory used for the manifest and passed to the
                downloaders.
        """
        self.data = data
        self.output_dir = output_dir
        self.manifest_file = os.path.join(output_dir, "manifest.json")

    def generate_manifest(self):
        """Build the manifest by delegating to ``Aggregator.generate_manifest``."""
        aggregator = Aggregator(self.data, self.output_dir)
        aggregator.generate_manifest()

    def download(self, threads: int = 4, include: list = None, exclude: list = None):
        """Download the GDC files and then the TCIA files of the cohort.

        Args:
            threads: Worker count handed to both downloaders.
            include: Optional filter forwarded to both downloaders.
            exclude: Optional filter forwarded to both downloaders.

        Raises:
            FileNotFoundError: If the manifest file does not exist yet.
        """
        if not os.path.exists(self.manifest_file):
            raise FileNotFoundError(
                f"No manifest file found in {self.output_dir}. Please run generate_manifest first."
            )

        self._download_gdc_files(threads, include=include, exclude=exclude)
        # The filters must reach the TCIA downloader as well; previously they
        # were silently dropped here and only applied to the GDC download.
        self._download_tcia_files(threads, include=include, exclude=exclude)

    def include(self, modalities):
        """Print which modalities would be included.

        NOTE(review): this only prints a message; it stores nothing and does
        not affect ``download``.
        """
        print(f"Only including {modalities} modalities in download")

    @staticmethod
    def _format_size(size):
        """Render a byte count as a two-decimal string in KB, MB or GB (1024-based)."""
        mib = 1024 * 1024
        gib = 1024 * mib
        if size >= gib:
            return f"{size / gib:.2f} GB"
        if size >= mib:
            return f"{size / mib:.2f} MB"
        return f"{size / 1024:.2f} KB"

    def stats(self):
        """Prints the statistics of the cohort in terms of file count and total size

        Every manifest entry is walked as a mapping. Each key whose value is a
        list is treated as one table row ("Modality"): the list length adds to
        the file count and the ``file_size`` of each item adds to the total.

        Returns:
            dict: Maps each such key to ``{"file_count": int, "total_size": int}``,
                ordered by ``total_size`` in descending order.
        """
        with open(self.manifest_file, "r", encoding="utf-8") as f:
            manifest = json.load(f)

        stats_dict = {}
        for entry in manifest:
            for key, value in entry.items():
                if not isinstance(value, list):
                    continue

                patient_size = 0
                for file_entry in value:
                    try:
                        patient_size += file_entry["file_size"]
                    except (KeyError, TypeError):
                        # Best effort: an item without a usable "file_size"
                        # still counts as a file but adds nothing to the size.
                        continue

                bucket = stats_dict.setdefault(
                    key, {"file_count": 0, "total_size": 0}
                )
                bucket["file_count"] += len(value)
                bucket["total_size"] += patient_size

        # Sort the dictionary by total size in descending order
        sorted_stats = sorted(
            stats_dict.items(), key=lambda x: x[1]["total_size"], reverse=True
        )

        console = Console()
        table = Table(show_header=True, header_style="bold green")
        table.add_column("Modality")
        table.add_column("File Count")
        table.add_column("Total Size")

        for key, value in sorted_stats:
            table.add_row(
                key,
                str(value["file_count"]),
                self._format_size(value["total_size"]),
            )

        console.print(table)
        return dict(sorted_stats)

    def _download_gdc_files(self, threads, include=None, exclude=None):
        """Run ``GDCFileDownloader.process_cases`` for this output directory."""
        gdc_downloader = GDCFileDownloader(
            self.output_dir, MAX_WORKERS=threads, include=include, exclude=exclude
        )
        gdc_downloader.process_cases()

    def _download_tcia_files(self, threads, include=None, exclude=None):
        """Run ``TCIAFileDownloader.process_cases`` for this output directory."""
        tcia_downloader = TCIAFileDownloader(
            self.output_dir, MAX_WORKERS=threads, include=include, exclude=exclude
        )
        tcia_downloader.process_cases()
stats()

Prints the statistics of the cohort in terms of file count and total size

Returns:
  • dict

    A dictionary containing the statistics of the cohort

Source code in app\minds\__init__.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def stats(self):
    """Prints the statistics of the cohort in terms of file count and total size

    Every manifest entry is walked as a mapping. Each key whose value is a
    list is treated as one table row ("Modality"): the list length adds to
    the file count and the ``file_size`` of each item adds to the total.

    Returns:
        dict: Maps each such key to ``{"file_count": int, "total_size": int}``,
            ordered by ``total_size`` in descending order.
    """
    with open(self.manifest_file, "r", encoding="utf-8") as f:
        manifest = json.load(f)

    stats_dict = {}
    for entry in manifest:
        for key, value in entry.items():
            if not isinstance(value, list):
                continue

            patient_size = 0
            for file_entry in value:
                try:
                    patient_size += file_entry["file_size"]
                except (KeyError, TypeError):
                    # Best effort: an item without a usable "file_size"
                    # still counts as a file but adds nothing to the size.
                    continue

            bucket = stats_dict.setdefault(key, {"file_count": 0, "total_size": 0})
            bucket["file_count"] += len(value)
            bucket["total_size"] += patient_size

    # Sort the dictionary by total size in descending order
    sorted_stats = sorted(
        stats_dict.items(), key=lambda x: x[1]["total_size"], reverse=True
    )

    console = Console()
    table = Table(show_header=True, header_style="bold green")
    table.add_column("Modality")
    table.add_column("File Count")
    table.add_column("Total Size")

    mib = 1024 * 1024
    gib = 1024 * mib
    for key, value in sorted_stats:
        size = value["total_size"]
        # ">=" so that exactly 1 GiB / 1 MiB is shown in the larger unit
        # instead of as "1024.00" of the smaller one.
        if size >= gib:
            size = f"{size / gib:.2f} GB"
        elif size >= mib:
            size = f"{size / mib:.2f} MB"
        else:
            size = f"{size / 1024:.2f} KB"
        table.add_row(key, str(value["file_count"]), size)

    console.print(table)
    return dict(sorted_stats)

build_cohort(output_dir, query=None, gdc_cohort=None, manifest=None)

Builds a cohort based on a query or a GDC cohort file and returns a Cohort object.

Source code in app\minds\__init__.py
149
150
151
152
153
154
155
156
157
158
159
160
161
def build_cohort(output_dir, query=None, gdc_cohort=None, manifest=None):
    """Create a Cohort from a query or from a GDC cohort file.

    ``query`` takes precedence when both sources are given. The manifest is
    generated only when ``manifest`` is None; any other value skips that step.

    Raises a ValueError when neither ``query`` nor ``gdc_cohort`` is provided.
    """
    if not query and not gdc_cohort:
        raise ValueError("Either a query or a gdc_cohort file must be provided")

    if query:
        source_data = db.get_minds_cohort(query)
    else:
        source_data = db.get_gdc_cohort(gdc_cohort)

    result = Cohort(source_data, output_dir)
    if manifest is not None:
        return result
    result.generate_manifest()
    return result

get_columns(table)

Get the list of columns in a table

Parameters

table : str The name of the table

Returns

list A list of columns in the table

Source code in app\minds\__init__.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def get_columns(table):
    """Return the columns of a database table

    Parameters
    ----------
    table : str
        Name of the table to inspect

    Returns
    -------
    list
        The columns of ``table`` as reported by ``db.get_columns``
    """
    columns = db.get_columns(table)
    return columns

get_tables()

Get the list of tables in the database

Returns

list A list of tables in the database

Source code in app\minds\__init__.py
18
19
20
21
22
23
24
25
26
def get_tables():
    """Return the tables available in the database

    Returns
    -------
    list
        The tables as reported by ``db.get_tables``
    """
    tables = db.get_tables()
    return tables

query(query)

Query the database and return the result as a pandas dataframe

Parameters

query : str The query string to be executed on the database

Returns

pandas.DataFrame The result of the query

Source code in app\minds\__init__.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def query(query):
    """Run a query against the database and return its result

    Parameters
    ----------
    query : str
        The query string to be executed on the database

    Returns
    -------
    pandas.DataFrame
        The result of the query, as returned by ``db.execute``
    """
    result = db.execute(query)
    return result