Skip to content

download

ensembl.io.genomio.assembly.download

Download assembly data files from INSDC or RefSeq.

FTPConnectionError

Bases: Exception

Error while initialising an FTP connection.

Source code in src/python/ensembl/io/genomio/assembly/download.py
56
57
class FTPConnectionError(Exception):
    """Signals a failure while initialising an FTP connection."""

FileDownloadError

Bases: Exception

When a file download fails or there is a problem with that file.

Source code in src/python/ensembl/io/genomio/assembly/download.py
52
53
class FileDownloadError(Exception):
    """Signals a failed file download, or a problem with the downloaded file."""

UnsupportedFormatError

Bases: Exception

When a string does not have the expected format.

Source code in src/python/ensembl/io/genomio/assembly/download.py
60
61
class UnsupportedFormatError(Exception):
    """Signals that a string does not follow the expected format."""

download_files(ftp_connection, accession, dl_dir, max_redo)

Given an INSDC accession, download all available files from the ftp to the download dir

Parameters:

Name Type Description Default
ftp_connection FTP

An open FTP connection object

required
accession str

Genome assembly accession.

required
dl_dir Path

Path to downloaded FTP files.

required
max_redo int

Maximum FTP connection retry attempts.

required
Source code in src/python/ensembl/io/genomio/assembly/download.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def download_files(ftp_connection: FTP, accession: str, dl_dir: Path, max_redo: int) -> None:
    """Download all the supported files of an assembly from the FTP server to a local directory.

    The current FTP directory is listed and every entry whose name contains `accession` is entered:
    its `md5checksums.txt` file is fetched first, then every file with a supported suffix (as listed
    in `_FILE_ENDS`) is downloaded through `_download_file`. A warning is logged for each listed
    entry that does not match the accession.

    Args:
        ftp_connection: An open FTP connection object.
        accession: Genome assembly accession.
        dl_dir: Path to downloaded FTP files.
        max_redo: Maximum FTP connection retry attempts (passed on to `_download_file`).
    """
    # Match the accession literally (its "." must not act as a regex wildcard) and reject a match
    # that is immediately followed by another digit, so that version "1" does not also select the
    # folders of versions "10", "11", ... which live in the same parent directory.
    accession_pattern = re.compile(re.escape(accession) + r"(?![0-9])")

    # Get the list of assemblies for this accession
    for ftp_dir, _ in ftp_connection.mlsd():
        if accession_pattern.search(ftp_dir):
            ftp_connection.cwd(ftp_dir)

            # First, get the md5sum file
            md5_file = "md5checksums.txt"
            md5_path = dl_dir / md5_file
            with md5_path.open("wb") as fp:
                ftp_connection.retrbinary(f"RETR {md5_file}", fp.write)
            md5_sums = get_checksums(md5_path)

            # Get all the files
            for ftp_file, _ in ftp_connection.mlsd():
                for end in _FILE_ENDS:
                    # Skip the "*_from_<end>" variants, only the plain "<end>" files are wanted
                    if ftp_file.endswith(end) and not ftp_file.endswith(f"_from_{end}"):
                        _download_file(ftp_connection, ftp_file, md5_sums, dl_dir, max_redo)
        else:
            logging.warning(
                f"Could not find accession '{accession}' from ftp {ftp_dir} in open FTP connection"
            )

establish_ftp(ftp_conn, ftp_url, accession)

Return an FTP connection positioned in the remote sub-directory derived from the provided accession.

Parameters:

Name Type Description Default
ftp_conn FTP

FTP class object.

required
ftp_url str

Specific FTP URL in connection request.

required
accession str

Genome assembly accession whose data is to be downloaded.

required

Raises:

Type Description
UnsupportedFormatError

If accession does not follow INSDC's accession format.

Source code in src/python/ensembl/io/genomio/assembly/download.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def establish_ftp(ftp_conn: FTP, ftp_url: str, accession: str) -> FTP:
    """Connect to an FTP server and move to the remote directory derived from `accession`.

    The accession (e.g. `GCA_000001405.15`) is split into its `GCA`/`GCF` prefix and three blocks of
    three digits, which form the remote path `genomes/all/<prefix>/<block1>/<block2>/<block3>`.
    The optional version suffix is not part of that path.

    Args:
        ftp_conn: FTP class object.
        ftp_url: Specific FTP URL in connection request.
        accession: Genome assembly accession whose data is to be downloaded.

    Returns:
        The same `ftp_conn` object, after `connect()`, `login()` and `cwd()` have been called on it.

    Raises:
        UnsupportedFormatError: If `accession` does not follow INSDC's accession format.
    """

    match = re.match(r"^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})(\.[0-9]+)?$", accession)
    if not match:
        raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}")
    prefix, part1, part2, part3 = match.group(1, 2, 3, 4)
    # FTP paths always use "/" whatever the local OS is, so do not rely on the OS path separator
    sub_dir = "/".join(("genomes", "all", prefix, part1, part2, part3))

    # Try now to establish connection to remote FTP server
    ftp_conn.connect(ftp_url)
    ftp_conn.login()
    ftp_conn.cwd(sub_dir)

    return ftp_conn

get_checksums(checksum_path)

Get a dict of checksums from a file, with file names as keys and sums as values

Parameters:

Name Type Description Default
checksum_path Path

Path location to MD5 checksum file.

required
Source code in src/python/ensembl/io/genomio/assembly/download.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def get_checksums(checksum_path: Path) -> Dict[str, str]:
    """Get a dict of checksums from a file, with file names as keys and sums as values.

    Each non-empty line is expected to look like `<checksum>  ./<file name>` (two spaces as
    separator). Only files at the top level are kept: any entry whose path still contains a `/`
    after removing the leading `./` is ignored.

    Args:
        checksum_path: Path location to MD5 checksum file.

    Returns:
        Dictionary of file names mapped to their checksum; empty if `checksum_path` is not a file.

    Raises:
        ValueError: If a non-empty line does not contain the two-space separator.
    """
    sums: Dict[str, str] = {}
    if not checksum_path.is_file():
        return sums
    with checksum_path.open(mode="r") as fh:
        for line in fh:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing empty line) instead of failing to unpack
            if not line:
                continue
            # Split on the first separator only, so a file name may itself contain two spaces
            checksum, file_path = line.split("  ", 1)
            # Drop the leading "./" only when it is actually there
            if file_path.startswith("./"):
                file_path = file_path[2:]
            # Files inside sub-directories are not of interest
            if "/" not in file_path:
                sums[file_path] = checksum
    return sums

get_files_selection(dl_dir)

Returns a dictionary with the relevant downloaded files classified.

Parameters:

Name Type Description Default
dl_dir Path

Local path to downloaded FTP files.

required

Returns:

Type Description
Dict[str, str]

Dictionary of file type (e.g. "report") as keys and the relative file path (from dl_dir) as values.

Raises:

Type Description
FileDownloadError

If dl_dir tree does not include a file named *_assembly_report.txt.

Source code in src/python/ensembl/io/genomio/assembly/download.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
def get_files_selection(dl_dir: Path) -> Dict[str, str]:
    """Classify the relevant downloaded files found in a directory.

    Every file directly inside `dl_dir` is compared against each suffix in `_FILE_ENDS`; a file
    named `<root name><suffix>`, or ending with the suffix without ending with `_from_<suffix>`,
    is recorded under the name associated with that suffix.

    Args:
        dl_dir: Local path to downloaded FTP files.

    Returns:
        Dictionary of file type (e.g. `"report"`) as keys and the matching file path (as a string) as
        values.

    Raises:
        FileDownloadError: If `get_root_name()` cannot find a root name in `dl_dir`, i.e. there is no
            `*_assembly_report.txt` file in it.
    """
    prefix = get_root_name(dl_dir)
    if not prefix:
        raise FileDownloadError(f"Could not determine the files root name in {dl_dir}")

    selection = {}
    for candidate in dl_dir.iterdir():
        file_name = candidate.name
        for suffix, label in _FILE_ENDS.items():
            is_root_file = file_name == prefix + suffix
            has_suffix = file_name.endswith(suffix) and not file_name.endswith(f"_from_{suffix}")
            if is_root_file or has_suffix:
                selection[label] = str(candidate)
    return selection

get_root_name(dl_dir)

Returns the root name, i.e. shared files basename prefix, using the assembly report file as base.

Parameters:

Name Type Description Default
dl_dir Path

Path location of downloaded FTP files.

required
Source code in src/python/ensembl/io/genomio/assembly/download.py
266
267
268
269
270
271
272
273
274
275
276
277
278
def get_root_name(dl_dir: Path) -> str:
    """Find the basename prefix shared by the downloaded files, based on the assembly report file.

    Args:
        dl_dir: Path location of downloaded FTP files.

    Returns:
        The part of the first matching file name that precedes `assembly_report.txt` (trailing
        underscore included), or an empty string if no file in `dl_dir` matches.
    """
    report_pattern = re.compile("^(.+_)assembly_report.txt")
    for entry in dl_dir.iterdir():
        found = report_pattern.search(entry.name)
        if found:
            # The first assembly report found decides the root name
            return found.group(1)
    return ""

main()

Module's entry-point.

Source code in src/python/ensembl/io/genomio/assembly/download.py
330
331
332
333
334
335
336
337
338
339
340
341
342
def main() -> None:
    """Command-line entry-point: parse the arguments, set up logging and download the assembly data."""
    arg_parser = ArgumentParser(description="Download an assembly data files from INSDC or RefSeq.")
    arg_parser.add_argument("--accession", required=True, help="Genome assembly accession")
    arg_parser.add_argument_dst_path(
        "--download_dir",
        default=Path.cwd(),
        help="Folder where the data will be downloaded",
    )
    arg_parser.add_argument("--version", action="version", version=ensembl.io.genomio.__version__)
    arg_parser.add_log_arguments()

    cli_args = arg_parser.parse_args()
    init_logging_with_args(cli_args)

    retrieve_assembly_data(cli_args.accession, cli_args.download_dir)

md5_files(dl_dir, md5_path=None, md5_filename='md5checksums.txt')

Check all files checksums with the sums listed in a checksum file, if available. Return False if there is no checksum file, or a file is missing, or has a wrong checksum.

Parameters:

Name Type Description Default
dl_dir Path

Path to the directory containing the downloaded FTP files.

required
md5_path Optional[Path]

Full path to an MD5 checksum file.

None
md5_filename str

Name of a checksum file in the dl_dir (used if no md5_path is given).

'md5checksums.txt'
Source code in src/python/ensembl/io/genomio/assembly/download.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def md5_files(dl_dir: Path, md5_path: Optional[Path] = None, md5_filename: str = "md5checksums.txt") -> bool:
    """
    Check all files checksums with the sums listed in a checksum file, if available.
    Return False if there is no checksum file, or a file is missing, or has a wrong checksum.

    Only the files listed in the checksum file whose name ends with one of the `_FILE_ENDS` suffixes
    (and not with `_from_<suffix>`) are checked.

    Args:
        dl_dir: Path to the directory containing the downloaded FTP files.
        md5_path: Full path to an MD5 checksum file.
        md5_filename: Name of a checksum file in the `dl_dir` (used if no `md5_path` is given).

    Returns:
        True if every checked file exists and matches its checksum, False otherwise.
    """
    # Get or set md5 file to user or default setting
    if md5_path is None:
        md5_path = dl_dir / md5_filename

    # Get checksums and compare
    sums = get_checksums(md5_path)
    if not sums:
        return False
    logging.info(f" File sums from {md5_path}: {len(sums)}")

    # Files are hashed piece by piece so that large files are never loaded entirely in memory
    chunk_size = 1024 * 1024
    for dl_file, checksum in sums.items():
        for end in _FILE_ENDS:
            if dl_file.endswith(end) and not dl_file.endswith(f"_from_{end}"):
                file_path = dl_dir / dl_file
                if not file_path.is_file():
                    logging.warning(f" No file {file_path} found")
                    return False
                # Check the file checksum
                md5 = hashlib.md5()
                with file_path.open(mode="rb") as f:
                    while True:
                        chunk = f.read(chunk_size)
                        if not chunk:
                            break
                        md5.update(chunk)
                file_sum = md5.hexdigest()
                if file_sum != checksum:
                    logging.warning(f" File {file_path} checksum doesn't match")
                    return False
                logging.info(f" File checksum ok {file_path}")
    logging.info(" All checksums OK")
    return True

retrieve_assembly_data(accession, download_dir, max_increment=0, max_redo=3)

Establishes an FTP connection and downloads a predefined subset of assembly data files from either INSDC or RefSeq.

Parameters:

Name Type Description Default
accession str

Genome assembly accession.

required
download_dir PathLike

Path to where to download FTP files.

required
max_increment int

Number of additional assembly versions to download after the given one, incrementing the accession version by one each time.

0
max_redo int

Maximum FTP connection retry attempts.

3

Raises:

Type Description
FileDownloadError

If no files are downloaded or if any does not match its MD5 checksum.

Source code in src/python/ensembl/io/genomio/assembly/download.py
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
def retrieve_assembly_data(
    accession: str,
    download_dir: PathLike,
    max_increment: int = 0,
    max_redo: int = 3,
) -> None:
    """Establishes an FTP connection and downloads a predefined subset of assembly data files from either
    INSDC or RefSeq.

    Nothing is downloaded if the files already in `download_dir` pass the MD5 check. Otherwise the
    files are downloaded for `accession` and then, `max_increment` more times, for the accession with
    its version incremented by one.

    Args:
        accession: Genome assembly accession.
        download_dir: Path to where to download FTP files.
        max_increment: Number of additional assembly versions to download after the given one.
        max_redo: Maximum FTP connection retry attempts.

    Raises:
        FileDownloadError: If no files are downloaded or if any does not match its MD5 checksum.
        ValueError: If the version has to be incremented but `accession` has no numeric version suffix.
    """
    download_dir = Path(download_dir)

    # Set and create dedicated dir for download
    download_dir.mkdir(parents=True, exist_ok=True)

    # Download if files don't exist or fail checksum
    if not md5_files(download_dir, None):
        logging.info(" Download the files")

        for increment in range(0, max_increment + 1):
            if increment > 0:
                logging.info(f" Increment accession version once from {accession}")
                # Increment the whole version number (not only its last digit), e.g. ".19" -> ".20"
                base, dot, version = accession.rpartition(".")
                if not dot or not version.isdigit():
                    raise ValueError(f"Cannot increment the version of accession '{accession}'")
                accession = f"{base}.{int(version) + 1}"
                download_dir.mkdir(parents=True, exist_ok=True)
            ftp_url = "ftp.ncbi.nlm.nih.gov"
            # The context manager closes the connection once done, even if the download fails
            with FTP() as ftp_instance:
                open_ftp_connection = establish_ftp(ftp_instance, ftp_url, accession)
                download_files(open_ftp_connection, accession, download_dir, max_redo)

        if not md5_files(download_dir, None):
            raise FileDownloadError("Failed md5sum of downloaded files")

    # Select specific files and give them a name
    files = get_files_selection(download_dir)

    if not files:
        raise FileDownloadError("No file downloaded")