Reference

Cohort

Cohort class for managing and processing medical data.

Attributes:
  • data (any) –

    The input data for the cohort.

  • output_dir (str) –

    The directory where output files will be stored.

  • manifest_file (str) –

    The path to the manifest file in the output directory.

Methods:
  • __init__(data, output_dir) – Initializes the Cohort instance with data and output directory.
  • generate_manifest() – Generates a manifest file for the cohort using the Aggregator class.
  • download(threads=4, include=None, exclude=None) – Downloads files specified in the manifest. Raises FileNotFoundError if the manifest file is missing.
  • include(modalities) – Specifies the modalities to include in the download process.
  • stats() – Prints and returns statistics of the cohort, including file count and total size, grouped by modality.
  • _download_gdc_files(threads, include=None, exclude=None) – Downloads files from GDC using the GDCFileDownloader class.
  • _download_tcia_files(threads, include=None, exclude=None) – Downloads files from TCIA using the TCIAFileDownloader class.

Source code in app/med_minds/__init__.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
class Cohort:
    """
    Cohort class for managing and processing medical data.

    Attributes:
        data (any): The input data for the cohort.
        output_dir (str): The directory where output files will be stored.
        manifest_file (str): The path to the manifest file in the output directory.
    Methods:
        __init__(data, output_dir):
            Initializes the Cohort instance with data and output directory.
        generate_manifest():
            Generates a manifest file for the cohort using the Aggregator class.
        download(threads=4, include=None, exclude=None):
            Downloads files specified in the manifest. Raises FileNotFoundError if the manifest file is missing.
        include(modalities):
            Currently only prints which modalities were requested; it does not store them.
        stats():
            Prints and returns statistics of the cohort, including file count and total size, grouped by modality.
        _download_gdc_files(threads, include=None, exclude=None):
            Downloads files from GDC using the GDCFileDownloader class.
        _download_tcia_files(threads, include=None, exclude=None):
            Downloads files from TCIA using the TCIAFileDownloader class.
    """

    def __init__(self, data, output_dir):
        """Store the cohort data and make sure the output directory exists.

        Args:
            data: The input data for the cohort; handed to ``Aggregator`` by
                ``generate_manifest``.
            output_dir (str): Directory for output files. It is created
                (including parents) when it does not exist yet.
        """
        self.data = data
        self.output_dir = output_dir
        self.manifest_file = os.path.join(output_dir, "manifest.json")

        # exist_ok=True replaces the former "check, then create" sequence, which
        # could fail if another process created the directory in between.
        os.makedirs(output_dir, exist_ok=True)

    def generate_manifest(self):
        """Build an ``Aggregator`` from ``data`` and ``output_dir`` and run its ``generate_manifest``.

        NOTE(review): presumably this writes ``manifest_file`` — confirm against Aggregator.
        """
        aggregator = Aggregator(self.data, self.output_dir)
        aggregator.generate_manifest()

    def download(self, threads: int = 4, include: list = None, exclude: list = None):
        """Run the GDC download followed by the TCIA download.

        Args:
            threads (int): Forwarded to both downloaders as ``MAX_WORKERS``.
            include (list, optional): Forwarded unchanged to both downloaders.
            exclude (list, optional): Forwarded unchanged to both downloaders.

        Raises:
            FileNotFoundError: If ``manifest_file`` does not exist.
        """
        if not os.path.exists(self.manifest_file):
            raise FileNotFoundError(
                f"No manifest file found in {self.output_dir}. Please run generate_manifest first."
            )

        self._download_gdc_files(threads, include=include, exclude=exclude)
        self._download_tcia_files(threads, include=include, exclude=exclude)

    def include(self, modalities):
        """Print which modalities were requested.

        The value is only echoed; nothing is stored on the instance, so this
        call does not by itself change what ``download`` fetches.
        """
        print(f"Only including {modalities} modalities in download")

    def stats(self):
        """Prints the statistics of the cohort in terms of file count and total size

        The manifest is read as a list of entries. Within each entry, every
        key whose value is a list is treated as a group of files: the list
        length is added to that key's file count and each item's
        ``"file_size"`` is added to that key's total size. Items whose size
        cannot be read are reported on the console and contribute no size,
        but are still counted as files.

        Returns:
            dict: A dictionary containing the statistics of the cohort, mapping
            each key to ``{"file_count": ..., "total_size": ...}``, ordered by
            total size (largest first).

        Raises:
            FileNotFoundError: If ``manifest_file`` does not exist.
        """
        try:
            with open(self.manifest_file, "r") as f:
                manifest = json.load(f)
        except FileNotFoundError as err:
            # Same actionable message as download() instead of a bare OS error.
            raise FileNotFoundError(
                f"No manifest file found in {self.output_dir}. Please run generate_manifest first."
            ) from err

        stats_dict = {}
        for entry in manifest:
            for key, value in entry.items():
                if not isinstance(value, list):
                    continue

                patient_size = 0
                for file_info in value:
                    try:
                        patient_size += file_info["file_size"]
                    except (KeyError, TypeError) as e:
                        # Missing key, non-dict item or non-numeric size:
                        # report it and keep going (best effort).
                        console.print(
                            f"Error calculating size for {file_info}: {e}"
                        )

                totals = stats_dict.setdefault(
                    key, {"file_count": 0, "total_size": 0}
                )
                totals["file_count"] += len(value)
                totals["total_size"] += patient_size

        # Sort the dictionary by total size in descending order
        sorted_stats = sorted(
            stats_dict.items(), key=lambda x: x[1]["total_size"], reverse=True
        )

        table = Table(show_header=True, header_style="bold green")
        table.add_column("Modality")
        table.add_column("File Count")
        table.add_column("Total Size")

        kib = 1024
        mib = kib * 1024
        gib = mib * 1024
        for key, value in sorted_stats:
            size = value["total_size"]
            # ">=" so that exactly 1 MiB / 1 GiB is shown in the larger unit
            # (previously rendered as "1024.00 KB" / "1024.00 MB").
            if size >= gib:
                size = f"{size / gib:.2f} GB"
            elif size >= mib:
                size = f"{size / mib:.2f} MB"
            else:
                size = f"{size / kib:.2f} KB"
            table.add_row(key, str(value["file_count"]), size)

        console.print(table)
        return dict(sorted_stats)

    def _download_gdc_files(self, threads, include=None, exclude=None):
        """Create a ``GDCFileDownloader`` for ``output_dir`` and call its ``process_cases``.

        ``threads`` is passed as ``MAX_WORKERS``; ``include`` and ``exclude``
        are forwarded unchanged.
        """
        gdc_downloader = GDCFileDownloader(
            self.output_dir, MAX_WORKERS=threads, include=include, exclude=exclude
        )
        gdc_downloader.process_cases()

    def _download_tcia_files(self, threads, include=None, exclude=None):
        """Create a ``TCIAFileDownloader`` for ``output_dir`` and call its ``process_cases``.

        ``threads`` is passed as ``MAX_WORKERS``; ``include`` and ``exclude``
        are forwarded unchanged.
        """
        tcia_downloader = TCIAFileDownloader(
            self.output_dir, MAX_WORKERS=threads, include=include, exclude=exclude
        )
        tcia_downloader.process_cases()

stats()

Prints the statistics of the cohort in terms of file count and total size

Returns:
  • dict

    A dictionary containing the statistics of the cohort

Source code in app/med_minds/__init__.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def stats(self):
    """Prints the statistics of the cohort in terms of file count and total size

    The manifest is read as a list of entries. Within each entry, every key
    whose value is a list is treated as a group of files: the list length is
    added to that key's file count and each item's ``"file_size"`` is added
    to that key's total size. Items whose size cannot be read are reported on
    the console and contribute no size, but are still counted as files.

    Returns:
        dict: A dictionary containing the statistics of the cohort, mapping
        each key to ``{"file_count": ..., "total_size": ...}``, ordered by
        total size (largest first).

    Raises:
        FileNotFoundError: If ``self.manifest_file`` does not exist.
    """
    try:
        with open(self.manifest_file, "r") as f:
            manifest = json.load(f)
    except FileNotFoundError as err:
        # Replace the bare OS error with an actionable message.
        raise FileNotFoundError(
            f"No manifest file found in {self.output_dir}. Please run generate_manifest first."
        ) from err

    stats_dict = {}
    for entry in manifest:
        for key, value in entry.items():
            if not isinstance(value, list):
                continue

            patient_size = 0
            for file_info in value:
                try:
                    patient_size += file_info["file_size"]
                except (KeyError, TypeError) as e:
                    # Missing key, non-dict item or non-numeric size:
                    # report it and keep going (best effort).
                    console.print(f"Error calculating size for {file_info}: {e}")

            totals = stats_dict.setdefault(key, {"file_count": 0, "total_size": 0})
            totals["file_count"] += len(value)
            totals["total_size"] += patient_size

    # Sort the dictionary by total size in descending order
    sorted_stats = sorted(
        stats_dict.items(), key=lambda x: x[1]["total_size"], reverse=True
    )

    table = Table(show_header=True, header_style="bold green")
    table.add_column("Modality")
    table.add_column("File Count")
    table.add_column("Total Size")

    kib = 1024
    mib = kib * 1024
    gib = mib * 1024
    for key, value in sorted_stats:
        size = value["total_size"]
        # ">=" so that exactly 1 MiB / 1 GiB is shown in the larger unit
        # (previously rendered as "1024.00 KB" / "1024.00 MB").
        if size >= gib:
            size = f"{size / gib:.2f} GB"
        elif size >= mib:
            size = f"{size / mib:.2f} MB"
        else:
            size = f"{size / kib:.2f} KB"
        table.add_row(key, str(value["file_count"]), size)

    console.print(table)
    return dict(sorted_stats)

build_cohort(output_dir, query=None, gdc_cohort=None, manifest=None)

Builds a cohort based on a query or a GDC cohort file and returns a Cohort object.

Source code in app/med_minds/__init__.py
177
178
179
180
181
182
183
184
185
186
187
188
189
def build_cohort(output_dir, query=None, gdc_cohort=None, manifest=None):
    """Create a Cohort from either a query or a GDC cohort file.

    ``query`` takes precedence: when it is truthy the cohort data comes from
    ``db.get_minds_cohort(query)``; otherwise ``gdc_cohort`` is used with
    ``db.get_gdc_cohort``. The manifest is generated only when ``manifest``
    is None; any other value skips generation.

    Raises a ValueError when neither ``query`` nor ``gdc_cohort`` is given.
    """
    # Guard clause: nothing to build the cohort from.
    if not query and not gdc_cohort:
        raise ValueError("Either a query or a gdc_cohort file must be provided")

    if query:
        source_data = db.get_minds_cohort(query)
    else:
        source_data = db.get_gdc_cohort(gdc_cohort)

    result = Cohort(source_data, output_dir)
    needs_manifest = manifest is None
    if needs_manifest:
        result.generate_manifest()
    return result

get_columns(table)

Get the list of columns in a table

Parameters

table : str The name of the table

Returns

list A list of columns in the table

Source code in app/med_minds/__init__.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def get_columns(table):
    """Return the columns of a database table

    Parameters
    ----------
    table : str
        Name of the table to look up

    Returns
    -------
    list
        The columns of the table, as returned by ``db.get_columns``
    """
    columns = db.get_columns(table)
    return columns

get_tables()

Get the list of tables in the database

Returns

list A list of tables in the database

Source code in app/med_minds/__init__.py
19
20
21
22
23
24
25
26
27
def get_tables():
    """Return the tables available in the database

    Returns
    -------
    list
        The tables in the database, as returned by ``db.get_tables``
    """
    tables = db.get_tables()
    return tables

query(query)

Query the database and return the result as a pandas dataframe

Parameters

query : str The query string to be executed on the database

Returns

pandas.DataFrame The result of the query

Source code in app/med_minds/__init__.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def query(query):
    """Execute a query on the database and return its result as a pandas dataframe

    Parameters
    ----------
    query : str
        The query string to run; passed unchanged to ``db.execute``

    Returns
    -------
    pandas.DataFrame
        The result of the query
    """
    result = db.execute(query)
    return result