Skip to content

load_sequence_data

ensembl.brc4.runnable.load_sequence_data

load_sequence_data

Bases: BaseRunnable

loading sequence data, seq region names, atrributes and synonyms

eHive module to load sequnce data, seq region names, atrributes and synonyms from FASTAs AGPs and seq_region.json. Various ensembl-analysis perl scripts are used to create coord_systems, load sequences and set attributes. SQL commands through out the code to be replaces with the proper python API at some point.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
class load_sequence_data(eHive.BaseRunnable):
    """
    loading sequence data, seq region names, atrributes and synonyms

    eHive module to load sequnce data, seq region names, atrributes and synonyms from FASTAs AGPs and seq_region.json.
    Various ensembl-analysis perl scripts are used to create coord_systems, load sequences and set attributes.
    SQL commands through out the code to be replaces with the proper python API at some point.
    """

    def param_defaults(self):
        """
        default parameter/options values
        """
        return {
            # relative order of the coord_systems types to infer their rank from
            "cs_order": "ensembl_internal,chunk,contig,supercontig,non_ref_scaffold,scaffold,primary_assembly,superscaffold,linkage_group,chromosome",
            "artificial_cs": "ensembl_internal,chunk",  # coord_systems to ignore when adding synonyms for sequence_level cs
            "IUPAC": "RYKMSWBDHV",  # symbols to be replaced with N in the DNA sequences (ensembl core(107) doesn't support the whole IUPAC alphabet for DNA)
            # unversion scaffold, remove ".\d$" from seq_region.names if there's a need
            "unversion_scaffolds": 0,
            "versioned_sr_syn_src": "INSDC",  # INSDC(50710) # if unversioning non-sequence level cs, store original name (with version) as this synonym
            "sr_syn_src": "BRC4_Community_Symbol",  # BRC4_Community_Symbol(211) # if unversioning sequence-level cs, store original name (with version) as this synonym
            # nullify coord_system version for the given coord_system name
            "nullify_cs_version_from": "contig",
            # default coord_system name for single-level (no AGPs assemblies)
            "noagp_cs_name_default": "primary_assembly",
            # file for additional mapping of the synonym sources to the external_db (ensembl), as used by "get_external_db_mapping" function below
            "external_db_map": None,
            # set "coord_system_tag" seq_region attribute to this value if there's a corresponding "chromosome_display_order" list in genome.json metadata
            #   if None, only "chromosome" coord system is processed (if present)
            #   (see add_chr_karyotype_rank definition below )
            #   if seq_region already has `coord_system_tagi` attribute, it value updated only if "force_update_coord_system_tag" module param is True (see below)
            "cs_tag_for_ordered": None,
            # Force updating of the "coord_system_tags" attribute for seq_regions from `cs_tag_for_ordered` (see above)
            "force_update_coord_system_tag": False,
            # BRC4 compatibility mode; if on, "(EBI|BRC4)_seq_region_name" seq_region_attributes are added.
            #   Blocked by the "swap_gcf_gca" option. In this case insertion should be done on later pipeline stage after seq_region name swapping.
            "brc4_mode": True,
            # Whether to use RefSeq names as additional seq_region synonyms (if available) or not (see add_sr_synonyms definition below)
            #  Does not swap anything actually, just loads synonyms to be used by a later "swapping" stage
            #  Disables BRC4 compatibilty mode (see the "brc4_mode" option comment).
            "swap_gcf_gca": False,
            # list of coord systems used in "-ignore_coord_system" options of the "ensembl-analysis/scripts/assembly_loading/set_toplevel.pl" script
            #   part of the loading process
            "not_toplevel_cs": [],  # i.e. "contig", "non_ref_scaffold"
            # explicit list of seq_region properties (keys) to load as seq_region_attrib s (values) (see add_sr_attribs definition below)
            #   if a dict's used as a value, treat its keys as "json_path" (/ as delim) map, i.e.
            #       { "added_sequence" : { "assembly_provider" : { "name" : ... } } } -> "added_sequence/assembly_provider/name"
            #   only flattable properties can be used, no arrays
            #   arrays should be processed separately (see `add_sr_synonyms` or `add_karyotype_bands` definitions)
            # see src/python/ensembl/io/genomio/data/schemas/seq_region.json
            "sr_attrib_types": {
                "circular": "circular_seq",
                "codon_table": "codon_table",
                "location": "sequence_location",
                "non_ref": "non_ref",
                "coord_system_level": "coord_system_tag",
                "added_sequence": {
                    # json_path to attrib_type_code(str) mapping
                    "added_sequence/accession": "added_seq_accession",
                    "added_sequence/assembly_provider/name": "added_seq_asm_pr_nam",
                    "added_sequence/assembly_provider/url": "added_seq_asm_pr_url",
                    "added_sequence/annotation_provider/name": "added_seq_ann_pr_nam",
                    "added_sequence/annotation_provider/url": "added_seq_ann_pr_url",
                },  # added_sequence
            },
            # loading additional sequences to the already exsisting core db
            "load_additional_sequences": 0,
            # size of the sequence data chunk, if 0 (default), no chunking is performed
            "sequence_data_chunk": 0,
            #   min size of the sequence chunk, no chunking is done if 'sequence_data_chunk' < 'sequence_data_chunk_min'
            "sequence_data_chunk_min_len": 50_000,
            # coord system name for chunks
            "chunk_cs_name": "ensembl_internal",
        }

    def run(self):
        """
        Entry point for the Ehive module. All processing is done here in this case.
        """
        # params
        work_dir = self.param_required("work_dir")

        # initial sequence loading, using ensembl-analysis scripts
        self.initial_sequence_loading(work_dir)

        # load data from the corresponding core db tables
        external_db_map = self.load_map_from_core_db(
            "external_db", ["db_name", "external_db_id"], work_dir
        )  # for external_db
        attrib_type_map = self.load_map_from_core_db(
            "attrib_type", ["code", "attrib_type_id"], work_dir
        )  # for attrib_type
        seq_region_map = self.load_map_from_core_db(
            "seq_region", ["name", "seq_region_id"], work_dir
        )  # for seq_region

        # update synonyms and seq_region_attribs
        unversion = self.param("unversion_scaffolds")
        is_primary_assembly = self.from_param("manifest_data", "agp", not_throw=True) is None
        seq_region_file = self.from_param("manifest_data", "seq_region", not_throw=True)

        #   add seq_region synonyms
        self.add_sr_synonyms(
            seq_region_file,
            seq_region_map,
            external_db_map,
            self.pjc(work_dir, "seq_region_syns"),
            unversion=unversion,
        )

        #   add seq_region attributes
        self.add_sr_attribs(
            seq_region_file,
            seq_region_map,
            attrib_type_map,
            self.pjc(work_dir, "seq_region_attr"),
            unversion=unversion,
        )

        #   add seq_region EBI and BRC4 name attributes in the "BRC4 mode"
        #     special case of attributes adding with default values derived from seq_region names
        #     do not add if preparing to swap RefSeq and GeneBank ids; in this case attributes to be added at a later stage in pipeline
        #     (easier to insert then to update)
        if self.param("brc4_mode") and not self.param("swap_gcf_gca"):
            self.add_sr_ebi_brc4_names(
                seq_region_file,
                seq_region_map,
                attrib_type_map,
                self.pjc(work_dir, "seq_region_ebi_brc4_name"),
                unversion=unversion,
            )

        # add karyotype related data
        self.add_karyotype_data(
            seq_region_file,
            seq_region_map,
            attrib_type_map,
            self.pjc(work_dir, "karyotype"),
            unversion=unversion,
        )

    def initial_sequence_loading(self, work_dir: str):
        """
        initial preparation and loading of AGPs and fasta data.

        initial preparation and loading of AGPs and fasta data using ensembl-analysis perl scripts
        """
        # preprocess FASTA with sequences
        #   rename IUPAC to N symbols using sed
        fasta_clean = self.from_param("manifest_data", "fasta_dna")

        # start coord system ranking and agps processing
        agps = self.from_param("manifest_data", "agp", not_throw=True)

        # get the deafult coord_system order
        #   use noagp_cs_name_default for "noagp" assemblies
        cs_order = self.coord_sys_order(self.param("cs_order"))
        noagps_cs = self.param("noagp_cs_name_default")

        # remove gaps and lower_level mappings if the are coveres by higher level ones
        #   i.e.: remove 'contigN to chromosomeZ', if 'contigN to scaffoldM' and 'scaffoldM to chromosomeZ' are in place
        #   returns None if no agps provided
        agps_pruned_dir = self.pjc(work_dir, "agps_pruned")
        agps_pruned = self.prune_agps(agps, cs_order, agps_pruned_dir, self.param_bool("prune_agp"))

        # order
        # rank cs_names, met in agps.keys ("-" separated, i.e. "scaffold-contig") based on cs_order
        cs_rank = self.used_cs_ranks(agps_pruned, cs_order, noagps_cs)

        # chunk sequence data if needed
        #   no chunking if chunk_size < 50k
        chunk_size = int(self.param("sequence_data_chunk"))
        chunk_cs_name = self.param("chunk_cs_name")
        fasta_clean, cs_rank, agps_pruned = self.chunk_contigs(
            fasta_clean,
            cs_rank,
            agps_pruned,
            pj(work_dir, "chunking"),
            chunk_size=chunk_size,
            chunks_cs_name=chunk_cs_name,
        )

        # empty agps_pruned ignored
        self.load_seq_data(fasta_clean, agps_pruned, cs_rank, self.pjc(work_dir, "load"))

        # mark all the "contig"s or noagp_cs as being sourced from ENA
        if not self.param_bool("no_contig_ena_attrib"):
            # NB using original "agps" parameter (with no chunking data added)
            agps_raw = self.from_param("manifest_data", "agp", not_throw=True)
            if agps_raw is None:
                self.add_contig_ena_attrib(self.pjc(work_dir, "load", "set_ena"), cs_name=noagps_cs)
            else:
                self.add_contig_ena_attrib(self.pjc(work_dir, "load", "set_ena"))

        # unversion scaffold, remove ".\d$" from names if there's a need
        if self.param_bool("unversion_scaffolds"):
            self.unversion_scaffolds(cs_rank, self.pjc(work_dir, "unversion_scaffolds"))

        # add assembly mappings between various cs to meta table for the mapper to work properly
        cs_pairs = agps_pruned and agps_pruned.keys() or None
        self.add_asm_mappings(cs_pairs, self.pjc(work_dir, "asm_mappings"))

        # set toplevel seq_region attribute
        self.set_toplevel(self.pjc(work_dir, "set_toplevel"), self.param("not_toplevel_cs"))

        # nullify contig version and update mappings strings accordingly; ignore for "load_additional_sequences" mode
        if not self.param_bool("load_additional_sequences"):
            self.nullify_ctg_cs_version(cs_order, self.pjc(work_dir, "asm_mapping", "nullify_cs_versions"))

    def add_sr_synonyms(
        self,
        seq_region_file: str,
        seq_region_map: dict,
        external_db_map: dict,
        work_dir: str,
        unversion: bool = False,
        unversionable_sources_set: frozenset = frozenset(["INSDC", "RefSeq"]),
    ):
        """
        Add seq_region_synonym from the seq_region_file meta data file.

        Add seq_region_synonym from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file.
        Merge with the already existing ones in the db.

        If unversion is true:
          * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible
          * the unversioned synonyms from the unversionable_sources_set will be added as well as the original ones

        Too close to the DB schema.
        """
        os.makedirs(work_dir, exist_ok=True)

        # return if there's nothing to add
        if not seq_region_file:
            return

        # get seq_region ids, names, syns from db
        synonyms_trios_db = self.load_seq_region_synonyms_trios_from_core_db(
            self.pjc(work_dir, "syns_from_core")
        )

        # form set of synonyms already present in db
        synonyms_in_db = frozenset([trio[2] for trio in synonyms_trios_db if trio[2] != "NULL"])

        # subset of sources to use the allowed the unversion synonyms
        unversionable_sources = unversion and unversionable_sources_set or frozenset()

        # technical / optimization. get external_db_id for "ensembl_internal_synonym"
        ensembl_internal_synonym_ext_db_id = self.id_from_map_or_die(
            "ensembl_internal_synonym", external_db_map, "external_db_map"
        )

        # get dict for additional mapping for sources to external_db names if there's one specified by "external_db_map" module param
        #   not to be confused with the external_db_map function parameter above
        additional_sources_mapping = self.get_external_db_mapping()

        # load synonyms from the json file
        synonyms_from_json = (
            []
        )  # [ (seq_region_id, synonym, external_db_id)... ] list of trios for inserting into db
        with open(seq_region_file) as in_file:
            seq_regions = list(json.load(in_file))
            for seq_region in filter(lambda sr: sr.get("synonyms", False), seq_regions):
                # iterate through all seq_regions having "synonyms"
                seq_region_name, seq_region_id, _ = self.name_and_id_from_seq_region_item(
                    seq_region, seq_region_map, try_unversion=unversion
                )

                # fill synonyms_from_json list of trios
                for synonym_item in list(seq_region["synonyms"]):
                    synonym_name = synonym_item["name"]
                    source = synonym_item["source"]
                    unversioned_name = ""

                    # check if there's any addtional mapping for the source, remap if so
                    if additional_sources_mapping:
                        source = additional_sources_mapping.get(
                            source, source
                        )  # use the same name if no matches in additional_sources_mapping dict

                    # try to get unversioned name if applicable
                    if source in unversionable_sources:
                        unversioned_name = re.sub(r"\.\d+$", "", synonym_name)

                    # put trios if names are not already seen in db
                    if synonym_name not in synonyms_in_db:
                        external_db_id = self.id_from_map_or_die(source, external_db_map, "external_db_map")
                        synonyms_from_json.append(
                            (seq_region_id, self.quote_or_null(synonym_name), external_db_id)
                        )

                    #   put additional unversioned synonyms if there's a sane one
                    if (
                        unversioned_name
                        and unversioned_name != synonym_name
                        and unversioned_name not in synonyms_in_db
                    ):
                        synonyms_from_json.append(
                            (
                                seq_region_id,
                                self.quote_or_null(unversioned_name),
                                ensembl_internal_synonym_ext_db_id,
                            )
                        )

        # run insertion SQL
        self.insert_to_db(
            synonyms_from_json,
            "seq_region_synonym",
            ["seq_region_id", "synonym", "external_db_id"],
            self.pjc(work_dir, "new_seq_region_synonyms"),
            ignore=True,
        )

    def add_sr_attribs(
        self,
        seq_region_file: str,
        seq_region_map: dict,
        attrib_type_map: dict,
        work_dir,
        unversion: bool = False,
    ):
        """
        Add seq_region_attrib(s) from the seq_region_file meta data file.

        Explicit list is taken from "sr_attrib_types" module param.

        Add seq_region_attrib(s) from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file.
        Explicit list is taken from "sr_attrib_types" module param.

        "sr_attrib_types" defines { json_property -> attrib_type.name } map. If the value is dict,
        its keys are treated as "/"-delimited "json_path" (i.e. "added_sequence/assembly_provider/name").
        No arrays can be processed. Only simple or "flattable" types.

        If unversion is true:
          * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

        Too close to the DB schema.
        """
        os.makedirs(work_dir, exist_ok=True)

        # return if there's nothing to add
        if not seq_region_file:
            return

        # technical / optimization. get atttib_type_id(s)
        # create a smaller map with attrib_type_id(s) as values
        properties_to_use = (
            []
        )  # [frozen]set with the top-level "seq_region" properties, that should be processed
        path_attrib_id_map = dict()  # { "flatterned/json/paths" : attrib_id_map ))}
        # fill set and map
        for prop, attrib_type in self.param("sr_attrib_types").items():
            # adding high level properties to process
            properties_to_use.append(prop)
            # adding json paths (or properties themselves) to atrrib_type_id map
            if isinstance(attrib_type, dict):  # if using json paths (delimeterd with "/")
                for path, inner_attrib_type in attrib_type.items():
                    path_attrib_id_map[path] = self.id_from_map_or_die(
                        inner_attrib_type, attrib_type_map, "attrib_type_map"
                    )
            else:
                path_attrib_id_map[prop] = self.id_from_map_or_die(
                    attrib_type, attrib_type_map, "attrib_type_map"
                )
        # return if there's nothing to add
        if not properties_to_use:
            return

        properties_to_use = frozenset(properties_to_use)

        # load attributes from seq_region file
        attrib_trios = []  # [ (seq_region_id, attrib_id, value)... ] list of trios for inserting into db
        with open(seq_region_file) as in_file:
            seq_regions = list(json.load(in_file))
            for seq_region in seq_regions:
                # get seq_region_id (perhaps, by using unversioned name)
                seq_region_name, seq_region_id, unversioned_name = self.name_and_id_from_seq_region_item(
                    seq_region, seq_region_map, try_unversion=unversion
                )

                # iterate through properties
                for prop_name in properties_to_use:
                    if prop_name not in seq_region:
                        continue
                    # flattern path
                    path_attrib_id_values_list = self.flattern_seq_region_item(
                        seq_region, prop_name, path_attrib_id_map
                    )
                    # fill attrib_trios
                    for path, attrib_id, value in path_attrib_id_values_list:
                        attrib_trios.append((seq_region_id, attrib_id, self.quote_or_null(value)))

        # run insertion SQL
        self.insert_to_db(
            attrib_trios,
            "seq_region_attrib",
            ["seq_region_id", "attrib_type_id", "value"],
            self.pjc(work_dir, "brc4_ebi_seq_region_synonyms"),
            ignore=True,
        )

    def flattern_seq_region_item(
        self, seq_region: dict, prop_name: str, path_attrib_id_map: dict, sep: str = "/"
    ) -> list:
        """
        Flattern seq_region[property] and store corresponding [ (json_path, attrib_id, value)... ] (as list of trios).

        Only works for simple properties or dicts with no arrays on the path. Basically, implemets tree traversal.
        Utility function used by the `add_sr_attribs` method
        """
        res = []
        # is there anything to do
        if prop_name not in seq_region:
            return res

        # set up
        value = seq_region[prop_name]
        paths_to_go = [
            (prop_name, value)
        ]  # storing path and the corresponding value, to prevent repetetive traversals
        # iterate
        while paths_to_go:
            (path, value) = paths_to_go.pop()  # get last item
            if isinstance(value, list):
                # perhaps, it's better to raise exception then to continue silently
                continue
            if isinstance(value, dict):
                # if value is a complex object, add its leaves
                for key, val in value.items():
                    paths_to_go.append((f"{path}{sep}{key}", val))
                continue
            # if value is simple
            attrib_id = path_attrib_id_map.get(path, None)
            if attrib_id:
                res.append((path, attrib_id, value))
        # return what ever we have
        return res

    def add_sr_ebi_brc4_names(
        self,
        seq_region_file: str,
        seq_region_map: dict,
        attrib_type_map: dict,
        work_dir: str,
        unversion: bool = False,
    ):
        """
        Add "(EBI|BRC4)_seq_region_name" seq_region_attrib(s) either from the seq_region_file meta data file, or from original seq_region names.

        Add "(EBI|BRC4)_seq_region_name" seq_region_synonym from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file or from the original seq_region_names.
        A special case of attributes adding with default values derived from seq_region names.

        If unversion is true:
          * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

        Too close to the DB schema.
        """
        os.makedirs(work_dir, exist_ok=True)

        # return if there's nothing to add
        if not seq_region_file:
            return

        # technical / optimization. get atttib_type_id(s) for "(EBI|BRC4)_seq_region_name"
        tagged_sr_name_attrib_id = {
            tag: self.id_from_map_or_die(f"{tag}_seq_region_name", attrib_type_map, "attrib_type_map")
            for tag in ["EBI", "BRC4"]
        }

        # load BRC4/EBI name from seq_region file
        brc4_ebi_name_attrib_trios = (
            []
        )  # [ (seq_region_id, attrib_id, value)... ] list of trios for inserting into db
        with open(seq_region_file) as in_file:
            seq_regions = list(json.load(in_file))
            for seq_region in seq_regions:
                # get seq_region_id (perhaps, by using unversioned name)
                seq_region_name, seq_region_id, unversioned_name = self.name_and_id_from_seq_region_item(
                    seq_region, seq_region_map, try_unversion=unversion
                )
                # append attribs to the brc4_ebi_name_attrib_trios list
                for tag in ["BRC4", "EBI"]:
                    attrib_name = f"{tag}_seq_region_name"
                    attrib_id = tagged_sr_name_attrib_id[tag]
                    value = seq_region.get(attrib_name, seq_region_name)
                    brc4_ebi_name_attrib_trios.append((seq_region_id, attrib_id, self.quote_or_null(value)))

        # run insertion SQL
        self.insert_to_db(
            brc4_ebi_name_attrib_trios,
            "seq_region_attrib",
            ["seq_region_id", "attrib_type_id", "value"],
            self.pjc(work_dir, "brc4_ebi_seq_region_synonyms"),
            ignore=True,
        )

    def add_karyotype_data(
        self,
        seq_region_file: str,
        seq_region_map: dict,
        attrib_type_map: dict,
        work_dir: str,
        unversion: bool = False,
    ):
        """
        Adds various karyotypic data from seq_region file and assembly metadata (if present).

        Adds various karyotypic data from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file and assembly metadata (if present).

        If unversion is true:
          * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible
        """
        # add karyotyope bands data
        regions_with_karyotype_bands = self.add_karyotype_bands(
            seq_region_file,
            seq_region_map,
            attrib_type_map,
            self.pjc(work_dir, "karyotype_bands"),
            unversion=unversion,
        )

        # try to add karyotype ranks for regions listed in genome_data/assembly/chromosome_display_order metadata
        regions_with_ranks_from_assembly_metadata = self.add_karyotype_rank_based_on_assembly_metadata(
            seq_region_map,
            attrib_type_map,
            self.pjc(work_dir, "karyotype_ranks_from_meta"),
            unversion=unversion,
        )

        regions_with_ranks_from_chromosome_cs = []
        if not regions_with_ranks_from_assembly_metadata:
            # try to add karyotype_ranks for top-level regions from the "chromosome" coord_system
            regions_with_ranks_from_chromosome_cs = self.add_karyotype_rank_for_chromosomes(
                attrib_type_map, self.pjc(work_dir, "karyotype_ranks_for_chromosomes")
            )

        # make sure that regions with bands have karyotype_ranks
        self.add_karyotype_rank_from_bands_info(
            regions_with_karyotype_bands,
            regions_with_ranks_from_chromosome_cs + regions_with_ranks_from_assembly_metadata,
            attrib_type_map,
            self.pjc(work_dir, "karyotype_ranks_from_bands"),
        )

    def add_karyotype_bands(
        self,
        seq_region_file: str,
        seq_region_map: dict,
        attrib_type_map: dict,
        work_dir: str,
        unversion: bool = False,
        karyotype_bands_property="karyotype_bands",
    ) -> list:  # [ (seq_region_name, seq_region_id, unversioned_name) ]
        """
        Add karyotypic data from the seq_region metafile.

        Add karyotypic data from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file.
        Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions having karyotype bands info.

        If unversion is true:
          * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

        Too close to the DB schema.
        """
        os.makedirs(work_dir, exist_ok=True)

        # return if there's nothing to add
        if not seq_region_file:
            return

        # resulting list of seq regions with bands
        seq_regions_with_karyotype_bands = []  # [ ( seq_region_name, seq_region_id, unversioned_name )... ]

        # load BRC4/EBI name from seq_region file
        band_tuples = (
            []
        )  # [ (seq_region_id, seq_region_start, seq_region_end, band|"NULL", stain|"NULL")... ] list of tuples for inserting into db
        with open(seq_region_file) as in_file:
            seq_regions = list(json.load(in_file))
            for seq_region in filter(lambda sr: sr.get(karyotype_bands_property, False), seq_regions):
                # iterate through all seq_regions having non-empty "karyotype_bands"

                # get seq_region_id (perhaps, by using unversioned name)
                seq_region_name, seq_region_id, unversioned_name = self.name_and_id_from_seq_region_item(
                    seq_region, seq_region_map, try_unversion=unversion
                )

                # append trio to the resulting list
                seq_regions_with_karyotype_bands.append((seq_region_name, seq_region_id, unversioned_name))

                # append bands to the band_tuples list
                for band in seq_region[karyotype_bands_property]:
                    # print("BAND: " + str(band), file = sys.stderr)
                    # coords
                    seq_region_start = band["start"]
                    seq_region_end = band["end"]
                    # band_name and stain
                    band_name = band.get("name", None)
                    stain = band.get("stain", None)
                    # special cases for stain
                    structure = band.get("structure", None)
                    if structure == "telomere":
                        stain = "TEL"
                    elif structure == "centromere":
                        stain = "ACEN"

                    # append tuple
                    band_tuples.append(
                        (
                            seq_region_id,
                            seq_region_start,
                            seq_region_end,
                            self.quote_or_null(band_name),
                            self.quote_or_null(stain),
                        )
                    )

        # run insertion SQL
        self.insert_to_db(
            band_tuples,
            "karyotype",
            ["seq_region_id", "seq_region_start", "seq_region_end", "band", "stain"],
            self.pjc(work_dir, "karyotype_insertion"),
            ignore=True,
        )

        # return resulting list of regions with bands trios
        return seq_regions_with_karyotype_bands

    def add_karyotype_rank_based_on_assembly_metadata(
        self, seq_region_map: dict, attrib_type_map: dict, work_dir: str, unversion: bool = True
    ) -> list:  # [ (seq_region_name, seq_region_id, unversioned_name) ]
        """
        Add `karyotype_rank` attributes for seq region data from based on metadata from the "genome_data" module parameter.
        Add only to the seq_regions with ids listed in the array corresponding to 'genome_data/assembly/chromosome_display_order'.

        Set "coord_system_tag" attribute to the one listed in the "cs_tag_for_ordered" module param; or "chromosome" if param value is underfined.
        Force updating of the "coord_system_tags" if `force_update_coord_system_tag` module param is True.

        Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions with updated karyotype_ranks.

        If unversion is true:
          * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

        Too close to the DB schema.
        """

        # resulting list of seq_region with karyotype_rank
        regions_with_ranks_from_assembly_metadata = (
            []
        )  # [ ( seq_region_name, seq_region_id, unversioned_name )... ]

        # get `chromosome_display_order` list from the assembly metadata
        assembly_metadata = self.from_param("genome_data", "assembly", not_throw=True) or dict()
        chromosome_display_order_list = assembly_metadata.get("chromosome_display_order", [])

        # technical / optimization. get external_db_id for "karyotype_rank" and "coord_system_tag"
        karyotype_rank_attrib_id = self.id_from_map_or_die(
            "karyotype_rank", attrib_type_map, "attrib_type_map"
        )
        coord_system_tag_attrib_id = self.id_from_map_or_die(
            "coord_system_tag", attrib_type_map, "attrib_type_map"
        )
        coord_system_tag = self.param("cs_tag_for_ordered") or "chromosome"
        force_update_coord_system_tag = self.param("force_update_coord_system_tag") or False

        # set/update proper attributes for `chromosome_display_order_list` regions
        rank_insertions_trios = []
        coord_system_tag_attrib_insertion_trios = []
        coord_system_tag_attrib_seq_region_update_ids = []

        for seq_region_name_raw in chromosome_display_order_list:
            # wrap seq_region_name_raw into seq_region struct { "name": seq_region_name_raw } and
            # get seq_region_id (perhaps, by using unversioned name)
            seq_region_name, seq_region_id, unversioned_name = self.name_and_id_from_seq_region_item(
                {"name": seq_region_name_raw}, seq_region_map, try_unversion=unversion
            )

            # append trio to the resulting list
            regions_with_ranks_from_assembly_metadata.append(
                (seq_region_name, seq_region_id, unversioned_name)
            )

            # filling insert lists for "karyotype_rank" and "coord_system_tag" attributes
            rank_insertions_trios.append(
                (seq_region_id, karyotype_rank_attrib_id, len(rank_insertions_trios) + 1)
            )
            coord_system_tag_attrib_insertion_trios.append(
                (seq_region_id, coord_system_tag_attrib_id, self.quote_or_null(coord_system_tag))
            )

            # filling update list for "coord_system_tag" with seq_region_ids
            coord_system_tag_attrib_seq_region_update_ids.append(seq_region_id)

        # run insertion SQL for "karyotype_rank"
        self.insert_to_db(
            rank_insertions_trios,
            "seq_region_attrib",
            ["seq_region_id", "attrib_type_id", "value"],
            self.pjc(work_dir, "karyotype_rank_insertion"),
            ignore=True,
        )

        # run insertion SQL for "coord_system_tag"
        self.insert_to_db(
            coord_system_tag_attrib_insertion_trios,
            "seq_region_attrib",
            ["seq_region_id", "attrib_type_id", "value"],
            self.pjc(work_dir, "coord_system_tag_insertion"),
            ignore=True,
        )

        # forcing update of the "coord_system_tag"
        if force_update_coord_system_tag and coord_system_tag_attrib_seq_region_update_ids:
            seq_region_ids_str = ",".join(map(str, coord_system_tag_attrib_seq_region_update_ids))
            self.update_db_single_group(
                {"value": self.quote_or_null(coord_system_tag)},
                "seq_region_attrib",
                self.pjc(work_dir, "coord_system_tag_update"),
                where=f"attrib_type_id = {coord_system_tag_attrib_id} and seq_region_id in ({seq_region_ids_str})",
            )

        # return resulting list of regions with bands trios
        return regions_with_ranks_from_assembly_metadata

    def add_karyotype_rank_for_chromosomes(
        self, attrib_type_map: dict, work_dir: str, chromosome_coord_system_name="chromosome"
    ) -> list:  # [ (seq_region_name, seq_region_id, unversioned_name) ]
        """
        Add `karyotype_rank` attributes for seq region data from the "chromosome" coordinate system.

        Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions with updated karyotype_ranks.
        Not altering "coord_system_tag" tag attributes.

        If unversion is true:
          * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

        Too close to the DB schema.
        """

        # resulting list of seq_region with karyotype_rank
        #   list of top level seq regions from the `chromosome_coord_system_name`
        chromomes_seq_regions = self.get_toplevel_from_cs(
            chromosome_coord_system_name, self.pjc(work_dir, "chromosome_seq_regions")
        )

        if not chromomes_seq_regions:
            return chromomes_seq_regions

        # technical / optimization. get external_db_id for "karyotype_rank"
        karyotype_rank_attrib_id = self.id_from_map_or_die(
            "karyotype_rank", attrib_type_map, "attrib_type_map"
        )

        # set/update proper attributes for "chromomosome" regions
        rank_insertions_trios = []
        for _, seq_region_id, _ in chromomes_seq_regions:
            rank_insertions_trios.append(
                (seq_region_id, karyotype_rank_attrib_id, len(rank_insertions_trios) + 1)
            )

        # run insertion SQL for "karyotype_rank"
        #    do not alter "coord_system_tag" anyhow in the case of the "chromosome" coord_system
        self.insert_to_db(
            rank_insertions_trios,
            "seq_region_attrib",
            ["seq_region_id", "attrib_type_id", "value"],
            self.pjc(work_dir, "karyotype_rank_insertion"),
            ignore=True,
        )

        return chromomes_seq_regions

    def add_karyotype_rank_from_bands_info(
        self,
        regions_with_karyotype_bands: list,  # [ (seq_region_name, seq_region_id, unversioned_name) ]
        other_regions_with_ranks: list,  # [ (seq_region_name, seq_region_id, unversioned_name) ]
        attrib_type_map: dict,
        work_dir: str,
    ):
        """
        Add karyotype_ranks for `regions_with_karyotype_bands` (those with karyotype bands in seq_region metadata) but not present in `other_regions_with_ranks` list.

        Too close to the DB schema.
        """
        # form set of used seq_region_id(s)
        regions_with_ranks = frozenset(map(lambda el: el[1], other_regions_with_ranks))

        # get set of seq_region_ids with bands
        regions_with_bands = set(map(lambda el: el[1], regions_with_karyotype_bands))

        # seq_region_ids list to add ranks for
        region_ids_with_bands_but_no_karyotype_ranks = sorted(list(regions_with_bands - regions_with_ranks))

        # return if nothing to add
        if not region_ids_with_bands_but_no_karyotype_ranks:
            return

        # technical / optimization. get external_db_id for "karyotype_rank"
        karyotype_rank_attrib_id = self.id_from_map_or_die(
            "karyotype_rank", attrib_type_map, "attrib_type_map"
        )

        # set/update proper attributes for "chromomosome" regions
        rank_insertions_trios = []
        for seq_region_id in region_ids_with_bands_but_no_karyotype_ranks:
            rank_insertions_trios.append(
                (
                    seq_region_id,
                    karyotype_rank_attrib_id,
                    len(rank_insertions_trios) + 1 + len(regions_with_ranks),
                )
            )

        # run insertion SQL for "karyotype_rank"
        #    do not alter "coord_system_tag" anyhow in the case of the "chromosome" coord_system
        self.insert_to_db(
            rank_insertions_trios,
            "seq_region_attrib",
            ["seq_region_id", "attrib_type_id", "value"],
            self.pjc(work_dir, "karyotype_rank_insertion"),
            ignore=True,
        )

    def unversion_scaffolds(self, cs_rank, logs):
        """
        Unversion scaffold, remove ".\d$" from seq_region.names if there's a need

        Non-versioned syns for contigs (lower, sequence level), versioned for the rest.
        """
        # coord_systems to ignore when adding synonyms for sequence_level cs
        artificial_cs = frozenset(self.param("artificial_cs").split(","))
        seq_cs, max_rank = max(
            [(c, r) for c, r in cs_rank.items() if c not in artificial_cs], key=lambda k: k[1]
        )
        for cs in cs_rank:
            if cs == seq_cs:
                # for non-sequence level cs, store original name (with version) as "sr_syn_src" synonym
                xdb = self.param("sr_syn_src")
                self.copy_sr_name_to_syn(cs, xdb, self.pjc(logs, "cp2syn", cs))
                self.sr_name_unversion(cs, "seq_region_synonym", "synonym", self.pjc(logs, "unv_srs", cs))
            else:
                # for sequence-level cs, store original name (with version) as "versioned_sr_syn_src" synonym
                xdb = self.param("versioned_sr_syn_src")
                self.copy_sr_name_to_syn(cs, xdb, self.pjc(logs, "cp2syn", cs))
                self.sr_name_unversion(cs, "seq_region", "name", self.pjc(logs, "unv_sr", cs))

    def coord_sys_order(self, cs_order_str):
        cs_order_lst = map(lambda x: x.strip(), cs_order_str.split(","))
        return {e: i for i, e in enumerate(filter(lambda x: len(x) > 0, cs_order_lst))}

    def used_cs_ranks(self, agps, cs_order, noagp_default=None):
        # rank cs_names, met in agps.keys ("-" separated), i.e. "scaffold-contig"
        #   only agps keys used, values are ignored
        #   use noagp_cs_name_default for "noagp" assemblies
        if agps is None:
            if noagp_default is None:
                raise Exception("NoAGP assembly with no default coordinate system name")
            cs_used_set = frozenset([noagp_default])
        else:
            cs_used_set = frozenset(sum(map(lambda x: x.split("-"), agps.keys()), []))

        cs_unknown = cs_used_set.difference(cs_order.keys())
        if len(cs_unknown) > 0:
            raise Exception("Unknown coordinate system(s) %s" % {str(cs_unknown)})
        return {e: i for i, e in enumerate(sorted(cs_used_set, key=lambda x: -cs_order[x]), start=1)}

    def chunk_contigs(self, fasta, cs_ranks, agps, work_dir, chunk_size=0, chunks_cs_name="ensembl_internal"):
        """
        chunk dna sequence fasta
          no chunking if chunk_size < 50k
        """
        chunk_size_min_len = self.param_required("sequence_data_chunk_min_len")
        if chunk_size < chunk_size_min_len:
            return fasta, cs_ranks, agps

        # split using script
        os.makedirs(work_dir, exist_ok=True)
        _stderr = f"{work_dir}/chunking.stderr"
        _out_agp = f"{work_dir}/chunks.agp"
        _out_fasta = f"{work_dir}/chunks.fasta"

        _splitter = r"fasta_chunk"
        split_cmd = f"{_splitter} --chunk_size {chunk_size} --agp_out {_out_agp} --out {_out_fasta} --fasta_dna {fasta} 2> {_stderr}"

        print(f"running {split_cmd}", file=sys.stderr)
        # NB throws CalledProcessError if failed
        sp.run(split_cmd, shell=True, check=True)

        # add rank for chunks
        _cs_name, _cs_rank = sorted(cs_ranks.items(), key=lambda k: k[1])[-1]
        cs_ranks[chunks_cs_name] = _cs_rank + 1

        # add agps entry
        if agps is None:
            agps = dict()
        agps[f"{_cs_name}-{chunks_cs_name}"] = _out_agp

        return _out_fasta, cs_ranks, agps

    def prune_agps(self, agps, cs_order, agps_pruned_dir, pruning=True):
        # when loading agp sort by:
        #   highest component (cmp) cs level (lowest rank)
        #   lowest difference between cs ranks (asm - cmp)
        #   i.e: chromosome-scaffold scaffold-chunk chromosome-chunk
        #   if no agps return empty pruned result

        if not agps:
            return None

        agp_levels_sorted = self.order_agp_levels(agps, cs_order)

        # prune agps
        agps_pruned = dict()
        used_components = set()
        if not pruning:
            used_components = None
        for asm_cmp in agp_levels_sorted:
            agp_file_src = agps[asm_cmp]
            agp_file_dst = self.pjc(agps_pruned_dir, asm_cmp + ".agp")
            if self.agp_prune(agp_file_src, agp_file_dst, used_components) > 0:
                agps_pruned[asm_cmp] = agp_file_dst
        return agps_pruned

    def order_agp_levels(self, agps, cs_order):
        # sort agp for loading by:
        #   highest component (cmp) cs level (lowest rank)
        #   lowest difference between cs ranks (asm - cmp)
        #   i.e: chromosome-scaffold scaffold-chunk chromosome-chunk
        if not agps:
            return []

        agp_cs_pairs = list(map(lambda x: [x] + x.split("-"), agps.keys()))
        agp_levels = [(x[0], cs_order[x[1]], cs_order[x[2]]) for x in agp_cs_pairs]

        bad_agps = list(filter(lambda x: x[1] < x[2], agp_levels))
        if len(bad_agps) > 0:
            raise Exception("component cs has higher order than assembled cs %s" % (str(bad_agps)))

        agp_levels_sorted = [e[0] for e in sorted(agp_levels, key=lambda x: (-x[2], x[1] - x[2]))]
        return agp_levels_sorted

    def load_seq_data(self, fasta, agps, cs_rank, log_pfx):
        """loads sequence data for various coordinate systems accordingly with their rank"""
        asm_v = self.asm_name()

        sequence_rank = max(cs_rank.values())
        for cs, rank in sorted(cs_rank.items(), key=lambda p: -p[1]):
            logs = self.pjc(log_pfx, "%02d_%s" % (rank, cs))
            if rank == sequence_rank:
                self.load_cs_data(cs, rank, "fasta", asm_v, fasta, logs, loaded_regions=None, seq_level=True)
            else:
                useful_agps = list(filter(lambda x: cs in x, agps and agps.keys() or []))
                if len(useful_agps) == 0:
                    raise Exception("non-seq_level cs %s has no agps to assemble it from" % (cs))
                loaded_regions = set()
                for pair, agp_file_pruned in map(lambda k: (k, agps[k]), useful_agps):
                    if not pair.startswith(cs + "-"):
                        continue
                    self.load_cs_data(cs, rank, pair, asm_v, agp_file_pruned, logs, loaded_regions)

    def load_cs_data(self, cs, rank, pair, asm_v, src_file, log_pfx, loaded_regions=None, seq_level=False):
        """creates a coord_system and loads sequence or assembly(AGP) data for corresponding seqregions

        doesn't load already seen sequences
        """
        # NB load_seq_region.pl and load_agp.pl are not failing on parameter errors (0 exit code)
        os.makedirs(dirname(log_pfx), exist_ok=True)
        additional_load = self.param_bool("load_additional_sequences")
        if seq_level:
            self.load_seq_region(cs, rank, asm_v, src_file, log_pfx, seq_level, additional_load)
        elif loaded_regions is not None:
            new_regions = set()
            clean_file = src_file + ".regions_deduped"
            self.filter_already_loaded_regions_from_agp(src_file, clean_file, loaded_regions, new_regions)
            self.load_seq_region(cs, rank, asm_v, clean_file, log_pfx, seq_level, additional_load)
            loaded_regions.update(new_regions)
        if not seq_level:
            self.load_agp(pair, asm_v, src_file, log_pfx)

    def filter_already_loaded_regions_from_agp(self, src_file, dst_file, loaded_regions, new_regions):
        with open(src_file) as src:
            with open(dst_file, "w") as dst:
                for line in src:
                    fields = line.strip().split("\t")
                    (
                        asm_id,
                        asm_start,
                        asm_end,
                        asm_part,
                        type_,
                        cmp_id,
                        cmp_start,
                        cmp_end,
                        cmp_strand,
                    ) = fields
                    if type_ in "NU" or asm_id in loaded_regions:
                        continue
                    new_regions.add(asm_id)
                    print(line.strip(), file=dst)

    def agp_prune(self, from_file: str, to_file: str, used: set = None):
        """
        Remove already components from the AGP file if they are seen in "used" set
        """
        # reomve used component
        #   and GAPS as they are not used by 'ensembl-analysis/scripts/assembly_loading/load_agp.pl'
        os.makedirs(dirname(to_file), exist_ok=True)
        open_ = self.is_gz(from_file) and gzip.open or open
        if used is None:
            cmd = r"""{_cat} {_file} > {_out}""".format(
                _cat=self.is_gz(from_file) and "zcat" or "cat", _file=from_file, _out=to_file
            )
            print("running %s" % (cmd), file=sys.stderr)
            sp.run(cmd, shell=True, check=True)
            return 1
        writes = 0
        with open_(from_file, "r") as src:
            with open(to_file, "w") as dst:
                for line in src:
                    fields = line.strip().split("\t")
                    (
                        asm_id,
                        asm_start,
                        asm_end,
                        asm_part,
                        type_,
                        cmp_id,
                        cmp_start,
                        cmp_end,
                        cmp_strand,
                    ) = fields
                    if type_ in "NU" or cmp_id in used:
                        continue
                    used.add(cmp_id)
                    print(line.strip(), file=dst)
                    writes += 1
        return writes

    def get_external_db_mapping(self) -> dict:
        """
        Get a map from a file for external_dbs to Ensembl dbnames from "external_db_map" module(!) param
        """
        external_map_path = self.param("external_db_map")
        db_map = dict()
        if external_map_path is None:
            return db_map

        # Load the map
        with open(external_map_path, "r") as map_file:
            for line in map_file:
                if line.startswith("#"):
                    continue
                line = re.sub(r"#.*", "", line)
                if re.match(r"^\s*$", line):
                    continue
                (from_name, to_name, *rest) = line.strip().split("\t")
                if len(rest) > 0 and rest[0].upper() != "SEQ_REGION":
                    continue
                if to_name == "_IGNORE_":
                    continue
                db_map[from_name] = to_name
        return db_map

    # UTILS
    def db_string(self):
        return "-dbhost {host_} -dbport {port_} -dbuser {user_} -dbpass {pass_} -dbname {dbname_} ".format(
            host_=self.param("dbsrv_host"),
            port_=self.param("dbsrv_port"),
            user_=self.param("dbsrv_user"),
            pass_=self.param("dbsrv_pass"),
            dbname_=self.param("db_name"),
        )

    def pjc(self, *parts: list) -> str:
        """
        Join path parts and try to create every directory but the last one.
        """
        if not parts:
            return None

        parts = list(parts)
        last = parts.pop()
        prefix = pj(*parts)

        os.makedirs(prefix, exist_ok=True)

        return pj(prefix, last)

    def is_gz(self, filename):
        return filename.endswith(".gz")

    def asm_name(self):
        asm = self.from_param("genome_data", "assembly")
        if "name" not in asm:
            raise Exception("no assembly/name in genome_data")
        return asm["name"]

    # TODO: add some metafunc setter getter
    def from_param(self, param, key, not_throw=False):
        data = self.param_required(param)
        if key not in data:
            if not_throw:
                return None
            else:
                raise Exception("Missing required %s data: %s" % (param, key))
        return data[key]

    def param_bool(self, param):
        val = self.param(param)
        return bool(val) and "0" != val

    def load_map_from_sql_stdout(self, in_file, skip_header=False):
        """
        Load map from the SQL output

        Process input in_file with "key  value" pairs and load then
        into the {key : value} map.
        Skips header if skip_header.
        """
        data = dict()
        with open(in_file) as pairs_file:
            for line in pairs_file:
                if skip_header:
                    skip_header = False
                    continue
                (key, val) = line.strip().split("\t")
                data[key] = val
        return data

    def name_and_id_from_seq_region_item(
        self,
        seq_region_item: dict,
        seq_region_map: dict,
        try_unversion: bool = False,
        throw_missing: bool = True,
    ) -> (str, str, str):
        """
        Get (seq_region_name, seq_region_id, unversioned_name) from seq_region_item struct(dict)

        Gets unversioned_name only if "try_unversion" is True.
        Throws exception if not able to get seq_region_id from "seq_region_map" and "throw_missing" is true.
        """
        #   get seq_region_id (perhaps, by using unversioned name)
        seq_region_name = seq_region_item["name"]
        seq_region_id = seq_region_map.get(seq_region_name, None)
        unversioned_name = None
        if seq_region_id is None and try_unversion:
            # try to get seq_region_id for the unversioned name
            unversioned_name = re.sub(r"\.\d+$", "", seq_region_name)
            seq_region_id = seq_region_map.get(unversioned_name, "")

        # oops, we don't know such seq_region name
        if not seq_region_id and throw_missing:
            raise Exception(f"Not able to find seq_region for '{seq_region_name}'")

        return (seq_region_name, seq_region_id, unversioned_name)

    def id_from_map_or_die(self, key: str, map_dict: dict, name_for_panic):
        value = map_dict.get(key, None)
        if value is None:
            raise Exception(f"no such key '{key}' in '{name_for_panic}' map")
        return value

    ## Utilities using external scripts
    def remove_IUPAC(self, from_file: str, to_file: str):
        """remove non-valid symbols from FASTA file (using sed) ans store the result in a different location"""
        IUPAC = self.param("IUPAC")
        os.makedirs(dirname(to_file), exist_ok=True)
        cmd = r"""{_cat} {_file} | sed -r '/^[^>]/ {{ s/[{_IUPAC}]/N/g; s/{_iupac}/n/g }}' > {_out}""".format(
            _cat=self.is_gz(from_file) and "zcat" or "cat",
            _file=from_file,
            _IUPAC=IUPAC.upper(),
            _iupac=IUPAC.lower(),
            _out=to_file,
        )
        print("running %s" % (cmd), file=sys.stderr)
        return sp.run(cmd, shell=True, check=True)

    def load_seq_region(
        self,
        cs: str,
        rank: str,
        asm_v: str,
        src_file: str,
        log_pfx: str,
        seq_level=False,
        additional_load=False,
    ):
        """ensembl-analysis script (load_seq_region.pl) based utility for loading seq_regions FASTA sequences"""
        en_root = self.param_required("ensembl_root_dir")
        cmd = (
            r"""{_loader} {_db_string} {_asm_v_flag} -default_version -ignore_ambiguous_bases """
            + r"""    -rank {_rank} -coord_system_name {_cs} {_sl_flag} -{_tag}_file {_file}"""
            + r"""     > {_log}.stdout 2> {_log}.stderr"""
        ).format(
            _loader="perl %s"
            % (self.pjc(en_root, r"ensembl-analysis/scripts/assembly_loading/load_seq_region.pl")),
            _db_string=self.db_string(),
            _asm_v_flag=not additional_load and f"-coord_system_version {asm_v}" or "",
            _rank=rank,
            _cs=cs,
            _sl_flag=seq_level and "-sequence_level" or "",
            _tag=seq_level and "fasta" or "agp",
            _file=src_file,
            _log="%s_seq" % (log_pfx),
        )
        print("running %s" % (cmd), file=sys.stderr)
        return sp.run(cmd, shell=True, check=True)

    def load_agp(self, pair, asm_v, src_file, log_pfx):
        """ensembl script (load_agp.pl) based utility for loading seq_regions assembly data (AGPs)"""
        en_root = self.param_required("ensembl_root_dir")
        (asm_n, cmp_n) = pair.strip().split("-")
        cmd = (
            r"""{_loader} {_db_string} -assembled_version {_asm_v} """
            + r"""    -assembled_name {_asm} -component_name {_cmp} """
            + r"""    -agp_file {_file} """
            + r"""    > {_log}.stdout 2> {_log}.stderr"""
        ).format(
            _loader="perl %s" % (self.pjc(en_root, r"ensembl-analysis/scripts/assembly_loading/load_agp.pl")),
            _db_string=self.db_string(),
            _asm_v=asm_v,
            _asm=asm_n,
            _cmp=cmp_n,
            _file=src_file,
            _log="%s_agp_%s" % (log_pfx, pair.replace("-", "_")),
        )
        print("running %s" % (cmd), file=sys.stderr)
        return sp.run(cmd, shell=True, check=True)

    def set_toplevel(self, log_pfx, ignored_cs=[]):
        """
        Set toplevel(6) seq_region_attrib using ensembl script.

        Uses set_toplevel.pl ensembl script.
        """
        # set top_level(6) seq_region_attrib
        os.makedirs(dirname(log_pfx), exist_ok=True)
        en_root = self.param_required("ensembl_root_dir")
        cmd = (
            r"""{_set_tl} {_db_string} {_ignored_cs} """ + r"""     > {_log}.stdout 2> {_log}.stderr"""
        ).format(
            _set_tl="perl %s"
            % (self.pjc(en_root, r"ensembl-analysis/scripts/assembly_loading/set_toplevel.pl")),
            _db_string=self.db_string(),
            _ignored_cs=" ".join(map(lambda x: "-ignore_coord_system %s" % (x), ignored_cs)),
            _log=log_pfx,
        )
        print("running %s" % (cmd), file=sys.stderr)
        sp.run(cmd, shell=True, check=True)

        # remove toplevel attribute for seq_regions that are components
        self.remove_components_from_toplevel(log_pfx)

    ## SQL executor and utilities using plain SQL
    def run_sql_req(self, sql, log_pfx, from_file=False):
        os.makedirs(dirname(log_pfx), exist_ok=True)

        sql_option = r""" -sql '{_sql}' """.format(_sql=sql)
        if from_file:
            sql_option = r""" < '{_sql}' """.format(_sql=sql)

        cmd = r"""{_dbcmd} -url "{_srv}{_dbname}" {_sql_option} > {_out} 2> {_err}""".format(
            _dbcmd="perl %s/scripts/db_cmd.pl" % os.getenv("EHIVE_ROOT_DIR"),
            _srv=self.param("dbsrv_url"),
            _dbname=self.param("db_name"),
            _sql_option=sql_option,
            _out=log_pfx + ".stdout",
            _err=log_pfx + ".stderr",
        )
        print("running %s" % (cmd), file=sys.stderr)
        return sp.run(cmd, shell=True, check=True)

    def add_contig_ena_attrib(self, log_pfx, cs_name="contig"):
        """
        Add ENA attrib for contigs if their names are ENA accessions

        Nno sequence_level checks are used -- just cs name.
        See ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/SeqRegionNamesINSDC.pm .
        SQL code.
        """
        sql = r"""insert ignore into seq_region_attrib (seq_region_id, attrib_type_id, value)
                select
                  sr.seq_region_id, at.attrib_type_id, "ENA"
                from
                  seq_region sr, coord_system cs, attrib_type at
                where   sr.coord_system_id = cs.coord_system_id
                    and cs.name = "%s"
                    and at.code = "external_db"
              ;""" % (
            cs_name
        )
        return self.run_sql_req(sql, log_pfx)

    def copy_sr_name_to_syn(self, cs, x_db, log_pfx):
        """
        Store original seq_region names as seq_region_synonym

        Store original seq_region names (from a given cood_systen, "cs" param) as seq_region_synonyms (using "x_db" external source name)
        SQL code.
        """
        asm_v = self.asm_name()
        sql = r"""insert into seq_region_synonym (seq_region_id, synonym, external_db_id)
                  select
                      sr.seq_region_id, sr.name, xdb.external_db_id
                  from
                     seq_region sr, external_db xdb, coord_system cs
                  where   xdb.db_name = "%s"
                      and sr.coord_system_id = cs.coord_system_id
                      and cs.name = "%s"
                      and cs.version = "%s"
                      and sr.name like "%%._"
                ;""" % (
            x_db,
            cs,
            asm_v,
        )
        return self.run_sql_req(sql, log_pfx)

    def add_asm_mappings(self, cs_pairs, log_pfx):
        """
        Adds "assembly.mapping" strings to meta table.

        Nullifies asm_mappings contig versions as well, but don't nullify toplevel
        Doesn't add mapping id there is a single CS
        SQL code.
        """
        if cs_pairs is None or len(cs_pairs) < 1:
            return
        asm_v = self.asm_name()
        for pair in cs_pairs:
            higher, lower = pair.strip().split("-")
            sql = r"""insert ignore into meta (species_id, meta_key, meta_value) values
                    (1, "assembly.mapping", "{_higher}:{_v}|{_lower}:{_v}")
                  ;""".format(
                _v=asm_v, _higher=higher, _lower=lower
            )
            self.run_sql_req(sql, self.pjc(log_pfx, pair))

    def remove_components_from_toplevel(self, log_pfx):
        """
        Remove toplevel attribute for seq_regions that are "components" (parts of different seq_regions).

        SQL code.
        """

        # get list of seq_regions that are components
        sql_not_toplevel_list = r"""
          select distinct a.cmp_seq_region_id, sr_c.name, cs_c.name
            from assembly a,
                 seq_region sr_c, seq_region sr_a,
                 coord_system cs_c, coord_system cs_a
            where a.cmp_seq_region_id = sr_c.seq_region_id
              and a.asm_seq_region_id = sr_a.seq_region_id
              and sr_c.coord_system_id = cs_c.coord_system_id
              and sr_a.coord_system_id = cs_a.coord_system_id
              and cs_c.attrib like "%default_version%"
              and cs_a.attrib like "%default_version%"
              and sr_c.seq_region_id in (
                select distinct seq_region_id
                  from seq_region_attrib
                  where attrib_type_id = 6 and value = 1
              )
        """
        # perhaps, make sense to check sr_c.coord_system_id != sr_a.coord_system_id
        self.run_sql_req(sql_not_toplevel_list, ".".join([log_pfx, "not_toplevel_list"]))

        # delete wrongly assigned attribs
        sql_not_toplevel_delete = r"""
          delete from seq_region_attrib
            where attrib_type_id = 6 and value = 1
              and seq_region_id in (
                select distinct a.cmp_seq_region_id
                  from assembly a,
                       seq_region sr_c, seq_region sr_a,
                       coord_system cs_c, coord_system cs_a
                  where a.cmp_seq_region_id = sr_c.seq_region_id
                    and a.asm_seq_region_id = sr_a.seq_region_id
                    and sr_c.coord_system_id = cs_c.coord_system_id
                    and sr_a.coord_system_id = cs_a.coord_system_id
                    and cs_c.attrib like "%default_version%"
                    and cs_a.attrib like "%default_version%"
              );
        """
        # perhaps, check sr_c.coord_system_id != sr_a.coord_system_id
        self.run_sql_req(sql_not_toplevel_delete, ".".join([log_pfx, "not_toplevel_delete"]))

    def sr_name_unversion(self, cs, tbl, fld, log_pfx):
        """
        Remove version suffix from the seq_region names

        Removes '\.\d+$' suffices from the seq_region names
        SQL code.
        """
        # select synonym, substr(synonym,  1, locate(".", synonym, length(synonym)-2)-1)
        #     from seq_region_synonym  where synonym like "%._"
        asm_v = self.asm_name()
        sql = r"""update {_tbl} t, seq_region sr, coord_system cs
                    set
                      t.{_fld} = substr(t.{_fld},  1, locate(".", t.{_fld}, length(t.{_fld})-2)-1)
                    where t.{_fld} like "%._"
                      and t.seq_region_id = sr.seq_region_id
                      and sr.coord_system_id = cs.coord_system_id
                      and cs.name = "{_cs}"
                      and cs.version = "{_asm_v}"
                ;""".format(
            _tbl=tbl, _fld=fld, _cs=cs, _asm_v=asm_v
        )
        return self.run_sql_req(sql, log_pfx)

    def nullify_ctg_cs_version(self, cs_order, log_pfx: str):
        """
        Nullify every CS version with rank larger than that of "contig", but don't nullify toplevel ones.

        SQL code
        """
        asm_v = self.asm_name()
        # get cs_info (and if they have toplevel regions)
        sql = r"""select cs.coord_system_id as coord_system_id,
                         cs.name, cs.rank, (tl.coord_system_id is NULL) as no_toplevel
                    from coord_system cs
                      left join (
                        select distinct sr.coord_system_id
                          from seq_region sr, seq_region_attrib sra, attrib_type at
                          where at.code = "toplevel"
                            and sra.attrib_type_id = at.attrib_type_id
                            and sra.value = 1
                            and sra.seq_region_id = sr.seq_region_id
                      ) as tl on tl.coord_system_id = cs.coord_system_id
                    where cs.version = "{_asm_v}"
                    order by rank
              ;""".format(
            _asm_v=asm_v
        )
        # run_sql
        toplvl_pfx = self.pjc(log_pfx, "toplvl_info")
        self.run_sql_req(sql, toplvl_pfx)
        # load info
        cs_info = []
        with open(toplvl_pfx + ".stdout") as f:
            header = None
            for line in f:
                if header is None:
                    header = line.strip().split("\t")
                    continue
                cs_info.append(dict(zip(header, line.strip().split())))
        # return if there's no coord_systems to nullify versions for
        if not cs_info:
            return
        # get list of known cs from cs_order to clean version from
        nullify_cs_version_from = self.param("nullify_cs_version_from")
        if not nullify_cs_version_from or nullify_cs_version_from not in cs_order:
            return
        cs_thr_index = cs_order[nullify_cs_version_from]
        cs_names_to_keep_ver = frozenset([nm for (nm, ind) in cs_order.items() if ind > cs_thr_index])

        # choose cs rank threshold to start clearing version from
        clear_lst = [
            (cs["coord_system_id"], cs["name"])
            for cs in cs_info
            if (bool(int(cs["no_toplevel"])) and cs["name"] not in cs_names_to_keep_ver)
        ]

        # run sql
        if clear_lst:
            clear_pfx = self.pjc(log_pfx, "clear")
            with open(clear_pfx + ".sql", "w") as clear_sql:
                for cs_id, cs_name in clear_lst:
                    sql = r"""
                        update meta set
                            meta_value=replace(meta_value, "|{_cs_name}:{_asm_v}", "|{_cs_name}")
                            where meta_key="assembly.mapping";
                        update coord_system set version = NULL where coord_system_id = {_cs_id};
                    """.format(
                        _asm_v=asm_v, _cs_name=cs_name, _cs_id=cs_id
                    )
                    print(sql, file=clear_sql)
            self.run_sql_req(clear_pfx + ".sql", clear_pfx, from_file=True)

    def load_map_from_core_db(self, table, cols, work_dir) -> dict:
        """
        Load 2 "cols" from core db "table" as map

        Load { cols[0] : cols[1] } map from the core db "table"
        SQL code
        """
        out_pfx = self.pjc(work_dir, f"{table}_map")
        sql = f"""select {cols[0]}, {cols[1]} FROM {table};"""
        res = self.run_sql_req(sql, out_pfx)

        out_file = out_pfx + ".stdout"
        data = self.load_map_from_sql_stdout(out_file, skip_header=True)
        if not data:
            raise Exception(f"No '{table}' map loaded from '{out_file}'")
        return data

    def load_seq_region_synonyms_trios_from_core_db(self, work_dir: str) -> list:
        # was get_db_syns
        """
        Load seq_region_synonyms from from core db into [(seq_region_id, name, synonym)...] list

        SQL code
        """
        out_pfx = self.pjc(work_dir, f"seq_region_synonyms")
        sql = r"""select sr.seq_region_id as seq_region_id, sr.name, srs.synonym
                 from seq_region sr left join seq_region_synonym srs
                 on sr.seq_region_id = srs.seq_region_id
                 order by sr.seq_region_id
              ;"""

        res = self.run_sql_req(sql, out_pfx)

        syn_trios = []
        out_file = out_pfx + ".stdout"
        with open(out_file) as syns_file:
            skip_header = True
            for line in syns_file:
                if skip_header:
                    skip_header = False
                    continue
                (sr_id, name, syn) = line.strip().split("\t")
                syn_trios.append((sr_id, name, syn))
        return syn_trios

    def insert_to_db(
        self, list_of_tuples: list, table_name: str, col_names: list, work_dir: str, ignore: bool = True
    ):
        """
        Insert into the core db's {table_name} tuples from {list_of_tuples} as col_names.

        Use `quote_or_null` (see definition below) method for string values, when putting values into `list_of_tuples`
        SQL code
        """
        # return if nothing to do
        if not list_of_tuples:
            return

        # prepare request parts
        ignore_str = ignore and "IGNORE" or ""
        cols_str = ", ".join(col_names)

        # generate file with the insert SQL command
        insert_sql_file = self.pjc(work_dir, "insert.sql")
        with open(insert_sql_file, "w") as sql:
            print(f"INSERT {ignore_str} INTO {table_name} ({cols_str}) VALUES", file=sql)
            values_sep = ""
            for tpl in list_of_tuples:
                tpl_str = ", ".join(map(str, tpl))
                print(f"{values_sep}({tpl_str})", file=sql)
                values_sep = ", "
            print(";", file=sql)

        # run insert SQL from file
        self.run_sql_req(insert_sql_file, self.pjc(work_dir, "insert"), from_file=True)

    def quote_or_null(self, val: str, quotes: str = "'", null: str = "NULL", strings_only=True) -> str:
        """
        Return `val` wrapped in `quotes` or `null` value

        Quotes only strings (instances of `str`) if strings_only is True.
        """
        if val is None:
            return null
        if strings_only and isinstance(val, str):
            return f"{quotes}{val}{quotes}"
        return val

    def update_db_single_group(
        self, dict_of_col_to_value: dict, table_name: str, work_dir: str, where: str = None
    ):
        """
        Update given `table` name in db; set `col = val` for all key/value pairs from `dict_of_cols_to_values`

        If `where` condition is present its value is used for the "WHERE" SQL clause.
        Use `quote_or_null` (see definition below) method for string values, when putting values into `list_of_tuples`

        SQL code
        """
        # return if nothing to do
        if not dict_of_col_to_value:
            return

        # prepare request parts
        where_str = where and f"WHERE {where}" or ""
        col_val_str = ", ".join([f"{col} = {val}" for col, val in dict_of_col_to_value.items()])

        # generate file with the insert SQL command
        update_sql_file = self.pjc(work_dir, "update.sql")
        with open(insert_sql_file, "w") as sql:
            print(f"UPDATE {table_name} SET {col_val_str} {where_str};", file=sql)
            values_sep = ""
            for tpl in list_of_tuples:
                tpl_str = ", ".join(map(str, tpl))
                print(f"{values_sep}({tpl_str})", file=sql)
                values_sep = ", "
            print(";", file=sql)

        # run insert SQL from file
        self.run_sql_req(update_sql_file, self.pjc(work_dir, "update"), from_file=True)

    def get_toplevel_from_cs(self, coord_system_name, work_dir) -> list:
        """
        Returns list of [ (seq_region_name, seq_region_id, "") ] trios for toplevel seq_regions from coord system with `coord_system_name`
          or having  "coord_system_tag" attribute with the `coord_system_name` value

        SQL code
        """
        out_pfx = self.pjc(work_dir, f"toplevel_from_{coord_system_name}")
        sql = f"""SELECT DISTINCT sr.name, sr.seq_region_id
                FROM seq_region sr,
                     seq_region_attrib sra,
                     coord_system cs,
                     attrib_type at
                WHERE sr.seq_region_id = sra.seq_region_id
                  AND sr.coord_system_id = cs.coord_system_id
                  AND sra.attrib_type_id = at.attrib_type_id
                  AND (  ( cs.name = "{coord_system_name}" and at.code = "toplevel" )
                      OR ( at.code = "coord_system_tag" and sra.value = "{coord_system_name}" )
                      )
                  ORDER BY sr.seq_region_id;
               """

        res = self.run_sql_req(sql, out_pfx)

        sr_trios = []
        out_file = out_pfx + ".stdout"
        with open(out_file) as sr_file:
            skip_header = True
            for line in sr_file:
                if skip_header:
                    skip_header = False
                    continue
                (name, sr_id) = line.strip().split("\t")
                sr_trios.append((name, sr_id, ""))

        return sr_trios

add_asm_mappings(cs_pairs, log_pfx)

Adds "assembly.mapping" strings to meta table.

Nullifies asm_mappings contig versions as well, but don't nullify toplevel Doesn't add mapping id there is a single CS SQL code.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
def add_asm_mappings(self, cs_pairs, log_pfx):
    """
    Adds "assembly.mapping" strings to meta table.

    Nullifies asm_mappings contig versions as well, but don't nullify toplevel
    Doesn't add mapping id there is a single CS
    SQL code.
    """
    if cs_pairs is None or len(cs_pairs) < 1:
        return
    asm_v = self.asm_name()
    for pair in cs_pairs:
        higher, lower = pair.strip().split("-")
        sql = r"""insert ignore into meta (species_id, meta_key, meta_value) values
                (1, "assembly.mapping", "{_higher}:{_v}|{_lower}:{_v}")
              ;""".format(
            _v=asm_v, _higher=higher, _lower=lower
        )
        self.run_sql_req(sql, self.pjc(log_pfx, pair))

add_contig_ena_attrib(log_pfx, cs_name='contig')

Add ENA attrib for contigs if their names are ENA accessions

Nno sequence_level checks are used -- just cs name. See ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/SeqRegionNamesINSDC.pm . SQL code.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
def add_contig_ena_attrib(self, log_pfx, cs_name="contig"):
    """
    Add ENA attrib for contigs if their names are ENA accessions

    Nno sequence_level checks are used -- just cs name.
    See ensembl-datacheck/lib/Bio/EnsEMBL/DataCheck/Checks/SeqRegionNamesINSDC.pm .
    SQL code.
    """
    sql = r"""insert ignore into seq_region_attrib (seq_region_id, attrib_type_id, value)
            select
              sr.seq_region_id, at.attrib_type_id, "ENA"
            from
              seq_region sr, coord_system cs, attrib_type at
            where   sr.coord_system_id = cs.coord_system_id
                and cs.name = "%s"
                and at.code = "external_db"
          ;""" % (
        cs_name
    )
    return self.run_sql_req(sql, log_pfx)

add_karyotype_bands(seq_region_file, seq_region_map, attrib_type_map, work_dir, unversion=False, karyotype_bands_property='karyotype_bands')

Add karyotypic data from the seq_region metafile.

Add karyotypic data from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file. Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions having karyotype bands info.

If unversion is true
  • the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

Too close to the DB schema.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
def add_karyotype_bands(
    self,
    seq_region_file: str,
    seq_region_map: dict,
    attrib_type_map: dict,
    work_dir: str,
    unversion: bool = False,
    karyotype_bands_property="karyotype_bands",
) -> list:  # [ (seq_region_name, seq_region_id, unversioned_name) ]
    """
    Add karyotypic data from the seq_region metafile.

    Add karyotypic data from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file.
    Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions having karyotype bands info.

    If unversion is true:
      * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

    Too close to the DB schema.
    """
    os.makedirs(work_dir, exist_ok=True)

    # return if there's nothing to add
    if not seq_region_file:
        return

    # resulting list of seq regions with bands
    seq_regions_with_karyotype_bands = []  # [ ( seq_region_name, seq_region_id, unversioned_name )... ]

    # load BRC4/EBI name from seq_region file
    band_tuples = (
        []
    )  # [ (seq_region_id, seq_region_start, seq_region_end, band|"NULL", stain|"NULL")... ] list of tuples for inserting into db
    with open(seq_region_file) as in_file:
        seq_regions = list(json.load(in_file))
        for seq_region in filter(lambda sr: sr.get(karyotype_bands_property, False), seq_regions):
            # iterate through all seq_regions having non-empty "karyotype_bands"

            # get seq_region_id (perhaps, by using unversioned name)
            seq_region_name, seq_region_id, unversioned_name = self.name_and_id_from_seq_region_item(
                seq_region, seq_region_map, try_unversion=unversion
            )

            # append trio to the resulting list
            seq_regions_with_karyotype_bands.append((seq_region_name, seq_region_id, unversioned_name))

            # append bands to the band_tuples list
            for band in seq_region[karyotype_bands_property]:
                # print("BAND: " + str(band), file = sys.stderr)
                # coords
                seq_region_start = band["start"]
                seq_region_end = band["end"]
                # band_name and stain
                band_name = band.get("name", None)
                stain = band.get("stain", None)
                # special cases for stain
                structure = band.get("structure", None)
                if structure == "telomere":
                    stain = "TEL"
                elif structure == "centromere":
                    stain = "ACEN"

                # append tuple
                band_tuples.append(
                    (
                        seq_region_id,
                        seq_region_start,
                        seq_region_end,
                        self.quote_or_null(band_name),
                        self.quote_or_null(stain),
                    )
                )

    # run insertion SQL
    self.insert_to_db(
        band_tuples,
        "karyotype",
        ["seq_region_id", "seq_region_start", "seq_region_end", "band", "stain"],
        self.pjc(work_dir, "karyotype_insertion"),
        ignore=True,
    )

    # return resulting list of regions with bands trios
    return seq_regions_with_karyotype_bands

add_karyotype_data(seq_region_file, seq_region_map, attrib_type_map, work_dir, unversion=False)

Adds various karyotypic data from seq_region file and assembly metadata (if present).

Adds various karyotypic data from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file and assembly metadata (if present).

If unversion is true
  • the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible
Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
def add_karyotype_data(
    self,
    seq_region_file: str,
    seq_region_map: dict,
    attrib_type_map: dict,
    work_dir: str,
    unversion: bool = False,
):
    """
    Adds various karyotypic data from seq_region file and assembly metadata (if present).

    Adds various karyotypic data from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file and assembly metadata (if present).

    If unversion is true:
      * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible
    """
    # add karyotyope bands data
    regions_with_karyotype_bands = self.add_karyotype_bands(
        seq_region_file,
        seq_region_map,
        attrib_type_map,
        self.pjc(work_dir, "karyotype_bands"),
        unversion=unversion,
    )

    # try to add karyotype ranks for regions listed in genome_data/assembly/chromosome_display_order metadata
    regions_with_ranks_from_assembly_metadata = self.add_karyotype_rank_based_on_assembly_metadata(
        seq_region_map,
        attrib_type_map,
        self.pjc(work_dir, "karyotype_ranks_from_meta"),
        unversion=unversion,
    )

    regions_with_ranks_from_chromosome_cs = []
    if not regions_with_ranks_from_assembly_metadata:
        # try to add karyotype_ranks for top-level regions from the "chromosome" coord_system
        regions_with_ranks_from_chromosome_cs = self.add_karyotype_rank_for_chromosomes(
            attrib_type_map, self.pjc(work_dir, "karyotype_ranks_for_chromosomes")
        )

    # make sure that regions with bands have karyotype_ranks
    self.add_karyotype_rank_from_bands_info(
        regions_with_karyotype_bands,
        regions_with_ranks_from_chromosome_cs + regions_with_ranks_from_assembly_metadata,
        attrib_type_map,
        self.pjc(work_dir, "karyotype_ranks_from_bands"),
    )

add_karyotype_rank_based_on_assembly_metadata(seq_region_map, attrib_type_map, work_dir, unversion=True)

Add karyotype_rank attributes for seq region data from based on metadata from the "genome_data" module parameter. Add only to the seq_regions with ids listed in the array corresponding to 'genome_data/assembly/chromosome_display_order'.

Set "coord_system_tag" attribute to the one listed in the "cs_tag_for_ordered" module param; or "chromosome" if param value is underfined. Force updating of the "coord_system_tags" if force_update_coord_system_tag module param is True.

Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions with updated karyotype_ranks.

If unversion is true
  • the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

Too close to the DB schema.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
def add_karyotype_rank_based_on_assembly_metadata(
    self, seq_region_map: dict, attrib_type_map: dict, work_dir: str, unversion: bool = True
) -> list:  # [ (seq_region_name, seq_region_id, unversioned_name) ]
    """
    Add `karyotype_rank` attributes for seq region data from based on metadata from the "genome_data" module parameter.
    Add only to the seq_regions with ids listed in the array corresponding to 'genome_data/assembly/chromosome_display_order'.

    Set "coord_system_tag" attribute to the one listed in the "cs_tag_for_ordered" module param; or "chromosome" if param value is underfined.
    Force updating of the "coord_system_tags" if `force_update_coord_system_tag` module param is True.

    Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions with updated karyotype_ranks.

    If unversion is true:
      * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

    Too close to the DB schema.
    """

    # resulting list of seq_region with karyotype_rank
    regions_with_ranks_from_assembly_metadata = (
        []
    )  # [ ( seq_region_name, seq_region_id, unversioned_name )... ]

    # get `chromosome_display_order` list from the assembly metadata
    assembly_metadata = self.from_param("genome_data", "assembly", not_throw=True) or dict()
    chromosome_display_order_list = assembly_metadata.get("chromosome_display_order", [])

    # technical / optimization. get external_db_id for "karyotype_rank" and "coord_system_tag"
    karyotype_rank_attrib_id = self.id_from_map_or_die(
        "karyotype_rank", attrib_type_map, "attrib_type_map"
    )
    coord_system_tag_attrib_id = self.id_from_map_or_die(
        "coord_system_tag", attrib_type_map, "attrib_type_map"
    )
    coord_system_tag = self.param("cs_tag_for_ordered") or "chromosome"
    force_update_coord_system_tag = self.param("force_update_coord_system_tag") or False

    # set/update proper attributes for `chromosome_display_order_list` regions
    rank_insertions_trios = []
    coord_system_tag_attrib_insertion_trios = []
    coord_system_tag_attrib_seq_region_update_ids = []

    for seq_region_name_raw in chromosome_display_order_list:
        # wrap seq_region_name_raw into seq_region struct { "name": seq_region_name_raw } and
        # get seq_region_id (perhaps, by using unversioned name)
        seq_region_name, seq_region_id, unversioned_name = self.name_and_id_from_seq_region_item(
            {"name": seq_region_name_raw}, seq_region_map, try_unversion=unversion
        )

        # append trio to the resulting list
        regions_with_ranks_from_assembly_metadata.append(
            (seq_region_name, seq_region_id, unversioned_name)
        )

        # filling insert lists for "karyotype_rank" and "coord_system_tag" attributes
        rank_insertions_trios.append(
            (seq_region_id, karyotype_rank_attrib_id, len(rank_insertions_trios) + 1)
        )
        coord_system_tag_attrib_insertion_trios.append(
            (seq_region_id, coord_system_tag_attrib_id, self.quote_or_null(coord_system_tag))
        )

        # filling update list for "coord_system_tag" with seq_region_ids
        coord_system_tag_attrib_seq_region_update_ids.append(seq_region_id)

    # run insertion SQL for "karyotype_rank"
    self.insert_to_db(
        rank_insertions_trios,
        "seq_region_attrib",
        ["seq_region_id", "attrib_type_id", "value"],
        self.pjc(work_dir, "karyotype_rank_insertion"),
        ignore=True,
    )

    # run insertion SQL for "coord_system_tag"
    self.insert_to_db(
        coord_system_tag_attrib_insertion_trios,
        "seq_region_attrib",
        ["seq_region_id", "attrib_type_id", "value"],
        self.pjc(work_dir, "coord_system_tag_insertion"),
        ignore=True,
    )

    # forcing update of the "coord_system_tag"
    if force_update_coord_system_tag and coord_system_tag_attrib_seq_region_update_ids:
        seq_region_ids_str = ",".join(map(str, coord_system_tag_attrib_seq_region_update_ids))
        self.update_db_single_group(
            {"value": self.quote_or_null(coord_system_tag)},
            "seq_region_attrib",
            self.pjc(work_dir, "coord_system_tag_update"),
            where=f"attrib_type_id = {coord_system_tag_attrib_id} and seq_region_id in ({seq_region_ids_str})",
        )

    # return resulting list of regions with bands trios
    return regions_with_ranks_from_assembly_metadata

add_karyotype_rank_for_chromosomes(attrib_type_map, work_dir, chromosome_coord_system_name='chromosome')

Add karyotype_rank attributes for seq region data from the "chromosome" coordinate system.

Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions with updated karyotype_ranks. Not altering "coord_system_tag" tag attributes.

If unversion is true
  • the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

Too close to the DB schema.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
def add_karyotype_rank_for_chromosomes(
    self, attrib_type_map: dict, work_dir: str, chromosome_coord_system_name="chromosome"
) -> list:  # [ (seq_region_name, seq_region_id, unversioned_name) ]
    """
    Add `karyotype_rank` attributes for seq region data from the "chromosome" coordinate system.

    Returns list of [ (seq_region_name, seq_region_id, unversioned_name) ] trios for seq_regions with updated karyotype_ranks.
    Not altering "coord_system_tag" tag attributes.

    If unversion is true:
      * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

    Too close to the DB schema.
    """

    # resulting list of seq_region with karyotype_rank
    #   list of top level seq regions from the `chromosome_coord_system_name`
    chromomes_seq_regions = self.get_toplevel_from_cs(
        chromosome_coord_system_name, self.pjc(work_dir, "chromosome_seq_regions")
    )

    if not chromomes_seq_regions:
        return chromomes_seq_regions

    # technical / optimization. get external_db_id for "karyotype_rank"
    karyotype_rank_attrib_id = self.id_from_map_or_die(
        "karyotype_rank", attrib_type_map, "attrib_type_map"
    )

    # set/update proper attributes for "chromomosome" regions
    rank_insertions_trios = []
    for _, seq_region_id, _ in chromomes_seq_regions:
        rank_insertions_trios.append(
            (seq_region_id, karyotype_rank_attrib_id, len(rank_insertions_trios) + 1)
        )

    # run insertion SQL for "karyotype_rank"
    #    do not alter "coord_system_tag" anyhow in the case of the "chromosome" coord_system
    self.insert_to_db(
        rank_insertions_trios,
        "seq_region_attrib",
        ["seq_region_id", "attrib_type_id", "value"],
        self.pjc(work_dir, "karyotype_rank_insertion"),
        ignore=True,
    )

    return chromomes_seq_regions

add_karyotype_rank_from_bands_info(regions_with_karyotype_bands, other_regions_with_ranks, attrib_type_map, work_dir)

Add karyotype_ranks for regions_with_karyotype_bands (those with karyotype bands in seq_region metadata) but not present in other_regions_with_ranks list.

Too close to the DB schema.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
def add_karyotype_rank_from_bands_info(
    self,
    regions_with_karyotype_bands: list,  # [ (seq_region_name, seq_region_id, unversioned_name) ]
    other_regions_with_ranks: list,  # [ (seq_region_name, seq_region_id, unversioned_name) ]
    attrib_type_map: dict,
    work_dir: str,
):
    """
    Add karyotype_ranks for `regions_with_karyotype_bands` (those with karyotype bands in seq_region metadata) but not present in `other_regions_with_ranks` list.

    Too close to the DB schema.
    """
    # form set of used seq_region_id(s)
    regions_with_ranks = frozenset(map(lambda el: el[1], other_regions_with_ranks))

    # get set of seq_region_ids with bands
    regions_with_bands = set(map(lambda el: el[1], regions_with_karyotype_bands))

    # seq_region_ids list to add ranks for
    region_ids_with_bands_but_no_karyotype_ranks = sorted(list(regions_with_bands - regions_with_ranks))

    # return if nothing to add
    if not region_ids_with_bands_but_no_karyotype_ranks:
        return

    # technical / optimization. get external_db_id for "karyotype_rank"
    karyotype_rank_attrib_id = self.id_from_map_or_die(
        "karyotype_rank", attrib_type_map, "attrib_type_map"
    )

    # set/update proper attributes for "chromomosome" regions
    rank_insertions_trios = []
    for seq_region_id in region_ids_with_bands_but_no_karyotype_ranks:
        rank_insertions_trios.append(
            (
                seq_region_id,
                karyotype_rank_attrib_id,
                len(rank_insertions_trios) + 1 + len(regions_with_ranks),
            )
        )

    # run insertion SQL for "karyotype_rank"
    #    do not alter "coord_system_tag" anyhow in the case of the "chromosome" coord_system
    self.insert_to_db(
        rank_insertions_trios,
        "seq_region_attrib",
        ["seq_region_id", "attrib_type_id", "value"],
        self.pjc(work_dir, "karyotype_rank_insertion"),
        ignore=True,
    )

add_sr_attribs(seq_region_file, seq_region_map, attrib_type_map, work_dir, unversion=False)

Add seq_region_attrib(s) from the seq_region_file meta data file.

Explicit list is taken from "sr_attrib_types" module param.

Add seq_region_attrib(s) from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file. Explicit list is taken from "sr_attrib_types" module param.

"sr_attrib_types" defines { json_property -> attrib_type.name } map. If the value is dict, its keys are treated as "/"-delimited "json_path" (i.e. "added_sequence/assembly_provider/name"). No arrays can be processed. Only simple or "flattable" types.

If unversion is true
  • the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

Too close to the DB schema.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
def add_sr_attribs(
    self,
    seq_region_file: str,
    seq_region_map: dict,
    attrib_type_map: dict,
    work_dir,
    unversion: bool = False,
):
    """
    Add seq_region_attrib(s) from the seq_region_file meta data file.

    Explicit list is taken from "sr_attrib_types" module param.

    Add seq_region_attrib(s) from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file.
    Explicit list is taken from "sr_attrib_types" module param.

    "sr_attrib_types" defines { json_property -> attrib_type.name } map. If the value is dict,
    its keys are treated as "/"-delimited "json_path" (i.e. "added_sequence/assembly_provider/name").
    No arrays can be processed. Only simple or "flattable" types.

    If unversion is true:
      * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

    Too close to the DB schema.
    """
    os.makedirs(work_dir, exist_ok=True)

    # return if there's nothing to add
    if not seq_region_file:
        return

    # technical / optimization. get atttib_type_id(s)
    # create a smaller map with attrib_type_id(s) as values
    properties_to_use = (
        []
    )  # [frozen]set with the top-level "seq_region" properties, that should be processed
    path_attrib_id_map = dict()  # { "flatterned/json/paths" : attrib_id_map ))}
    # fill set and map
    for prop, attrib_type in self.param("sr_attrib_types").items():
        # adding high level properties to process
        properties_to_use.append(prop)
        # adding json paths (or properties themselves) to atrrib_type_id map
        if isinstance(attrib_type, dict):  # if using json paths (delimeterd with "/")
            for path, inner_attrib_type in attrib_type.items():
                path_attrib_id_map[path] = self.id_from_map_or_die(
                    inner_attrib_type, attrib_type_map, "attrib_type_map"
                )
        else:
            path_attrib_id_map[prop] = self.id_from_map_or_die(
                attrib_type, attrib_type_map, "attrib_type_map"
            )
    # return if there's nothing to add
    if not properties_to_use:
        return

    properties_to_use = frozenset(properties_to_use)

    # load attributes from seq_region file
    attrib_trios = []  # [ (seq_region_id, attrib_id, value)... ] list of trios for inserting into db
    with open(seq_region_file) as in_file:
        seq_regions = list(json.load(in_file))
        for seq_region in seq_regions:
            # get seq_region_id (perhaps, by using unversioned name)
            seq_region_name, seq_region_id, unversioned_name = self.name_and_id_from_seq_region_item(
                seq_region, seq_region_map, try_unversion=unversion
            )

            # iterate through properties
            for prop_name in properties_to_use:
                if prop_name not in seq_region:
                    continue
                # flattern path
                path_attrib_id_values_list = self.flattern_seq_region_item(
                    seq_region, prop_name, path_attrib_id_map
                )
                # fill attrib_trios
                for path, attrib_id, value in path_attrib_id_values_list:
                    attrib_trios.append((seq_region_id, attrib_id, self.quote_or_null(value)))

    # run insertion SQL
    self.insert_to_db(
        attrib_trios,
        "seq_region_attrib",
        ["seq_region_id", "attrib_type_id", "value"],
        self.pjc(work_dir, "brc4_ebi_seq_region_synonyms"),
        ignore=True,
    )

add_sr_ebi_brc4_names(seq_region_file, seq_region_map, attrib_type_map, work_dir, unversion=False)

Add "(EBI|BRC4)_seq_region_name" seq_region_attrib(s) either from the seq_region_file meta data file, or from original seq_region names.

Add "(EBI|BRC4)_seq_region_name" seq_region_synonym from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file or from the original seq_region_names. A special case of attributes adding with default values derived from seq_region names.

If unversion is true
  • the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

Too close to the DB schema.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
def add_sr_ebi_brc4_names(
    self,
    seq_region_file: str,
    seq_region_map: dict,
    attrib_type_map: dict,
    work_dir: str,
    unversion: bool = False,
):
    """
    Add "(EBI|BRC4)_seq_region_name" seq_region_attrib(s) either from the seq_region_file meta data file, or from original seq_region names.

    Add "(EBI|BRC4)_seq_region_name" seq_region_synonym from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file or from the original seq_region_names.
    A special case of attributes adding with default values derived from seq_region names.

    If unversion is true:
      * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible

    Too close to the DB schema.
    """
    os.makedirs(work_dir, exist_ok=True)

    # return if there's nothing to add
    if not seq_region_file:
        return

    # technical / optimization. get atttib_type_id(s) for "(EBI|BRC4)_seq_region_name"
    tagged_sr_name_attrib_id = {
        tag: self.id_from_map_or_die(f"{tag}_seq_region_name", attrib_type_map, "attrib_type_map")
        for tag in ["EBI", "BRC4"]
    }

    # load BRC4/EBI name from seq_region file
    brc4_ebi_name_attrib_trios = (
        []
    )  # [ (seq_region_id, attrib_id, value)... ] list of trios for inserting into db
    with open(seq_region_file) as in_file:
        seq_regions = list(json.load(in_file))
        for seq_region in seq_regions:
            # get seq_region_id (perhaps, by using unversioned name)
            seq_region_name, seq_region_id, unversioned_name = self.name_and_id_from_seq_region_item(
                seq_region, seq_region_map, try_unversion=unversion
            )
            # append attribs to the brc4_ebi_name_attrib_trios list
            for tag in ["BRC4", "EBI"]:
                attrib_name = f"{tag}_seq_region_name"
                attrib_id = tagged_sr_name_attrib_id[tag]
                value = seq_region.get(attrib_name, seq_region_name)
                brc4_ebi_name_attrib_trios.append((seq_region_id, attrib_id, self.quote_or_null(value)))

    # run insertion SQL
    self.insert_to_db(
        brc4_ebi_name_attrib_trios,
        "seq_region_attrib",
        ["seq_region_id", "attrib_type_id", "value"],
        self.pjc(work_dir, "brc4_ebi_seq_region_synonyms"),
        ignore=True,
    )

add_sr_synonyms(seq_region_file, seq_region_map, external_db_map, work_dir, unversion=False, unversionable_sources_set=frozenset(['INSDC', 'RefSeq']))

Add seq_region_synonym from the seq_region_file meta data file.

Add seq_region_synonym from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file. Merge with the already existing ones in the db.

If unversion is true
  • the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible
  • the unversioned synonyms from the unversionable_sources_set will be added as well as the original ones

Too close to the DB schema.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
def add_sr_synonyms(
    self,
    seq_region_file: str,
    seq_region_map: dict,
    external_db_map: dict,
    work_dir: str,
    unversion: bool = False,
    unversionable_sources_set: frozenset = frozenset(["INSDC", "RefSeq"]),
):
    """
    Add seq_region_synonym from the seq_region_file meta data file.

    Add seq_region_synonym from the src/python/ensembl/io/genomio/data/schemas/seq_region.json compatible meta data file.
    Merge with the already existing ones in the db.

    If unversion is true:
      * the unversioned synonym would be used to get the seq_region_id from "seq_region_map" if possible
      * the unversioned synonyms from the unversionable_sources_set will be added as well as the original ones

    Too close to the DB schema.
    """
    os.makedirs(work_dir, exist_ok=True)

    # return if there's nothing to add
    if not seq_region_file:
        return

    # get seq_region ids, names, syns from db
    synonyms_trios_db = self.load_seq_region_synonyms_trios_from_core_db(
        self.pjc(work_dir, "syns_from_core")
    )

    # form set of synonyms already present in db
    synonyms_in_db = frozenset([trio[2] for trio in synonyms_trios_db if trio[2] != "NULL"])

    # subset of sources to use the allowed the unversion synonyms
    unversionable_sources = unversion and unversionable_sources_set or frozenset()

    # technical / optimization. get external_db_id for "ensembl_internal_synonym"
    ensembl_internal_synonym_ext_db_id = self.id_from_map_or_die(
        "ensembl_internal_synonym", external_db_map, "external_db_map"
    )

    # get dict for additional mapping for sources to external_db names if there's one specified by "external_db_map" module param
    #   not to be confused with the external_db_map function parameter above
    additional_sources_mapping = self.get_external_db_mapping()

    # load synonyms from the json file
    synonyms_from_json = (
        []
    )  # [ (seq_region_id, synonym, external_db_id)... ] list of trios for inserting into db
    with open(seq_region_file) as in_file:
        seq_regions = list(json.load(in_file))
        for seq_region in filter(lambda sr: sr.get("synonyms", False), seq_regions):
            # iterate through all seq_regions having "synonyms"
            seq_region_name, seq_region_id, _ = self.name_and_id_from_seq_region_item(
                seq_region, seq_region_map, try_unversion=unversion
            )

            # fill synonyms_from_json list of trios
            for synonym_item in list(seq_region["synonyms"]):
                synonym_name = synonym_item["name"]
                source = synonym_item["source"]
                unversioned_name = ""

                # check if there's any addtional mapping for the source, remap if so
                if additional_sources_mapping:
                    source = additional_sources_mapping.get(
                        source, source
                    )  # use the same name if no matches in additional_sources_mapping dict

                # try to get unversioned name if applicable
                if source in unversionable_sources:
                    unversioned_name = re.sub(r"\.\d+$", "", synonym_name)

                # put trios if names are not already seen in db
                if synonym_name not in synonyms_in_db:
                    external_db_id = self.id_from_map_or_die(source, external_db_map, "external_db_map")
                    synonyms_from_json.append(
                        (seq_region_id, self.quote_or_null(synonym_name), external_db_id)
                    )

                #   put additional unversioned synonyms if there's a sane one
                if (
                    unversioned_name
                    and unversioned_name != synonym_name
                    and unversioned_name not in synonyms_in_db
                ):
                    synonyms_from_json.append(
                        (
                            seq_region_id,
                            self.quote_or_null(unversioned_name),
                            ensembl_internal_synonym_ext_db_id,
                        )
                    )

    # run insertion SQL
    self.insert_to_db(
        synonyms_from_json,
        "seq_region_synonym",
        ["seq_region_id", "synonym", "external_db_id"],
        self.pjc(work_dir, "new_seq_region_synonyms"),
        ignore=True,
    )

agp_prune(from_file, to_file, used=None)

Remove already components from the AGP file if they are seen in "used" set

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
def agp_prune(self, from_file: str, to_file: str, used: set = None):
    """
    Remove already components from the AGP file if they are seen in "used" set
    """
    # reomve used component
    #   and GAPS as they are not used by 'ensembl-analysis/scripts/assembly_loading/load_agp.pl'
    os.makedirs(dirname(to_file), exist_ok=True)
    open_ = self.is_gz(from_file) and gzip.open or open
    if used is None:
        cmd = r"""{_cat} {_file} > {_out}""".format(
            _cat=self.is_gz(from_file) and "zcat" or "cat", _file=from_file, _out=to_file
        )
        print("running %s" % (cmd), file=sys.stderr)
        sp.run(cmd, shell=True, check=True)
        return 1
    writes = 0
    with open_(from_file, "r") as src:
        with open(to_file, "w") as dst:
            for line in src:
                fields = line.strip().split("\t")
                (
                    asm_id,
                    asm_start,
                    asm_end,
                    asm_part,
                    type_,
                    cmp_id,
                    cmp_start,
                    cmp_end,
                    cmp_strand,
                ) = fields
                if type_ in "NU" or cmp_id in used:
                    continue
                used.add(cmp_id)
                print(line.strip(), file=dst)
                writes += 1
    return writes

asm_name()

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1125
1126
1127
1128
1129
def asm_name(self):
    asm = self.from_param("genome_data", "assembly")
    if "name" not in asm:
        raise Exception("no assembly/name in genome_data")
    return asm["name"]

chunk_contigs(fasta, cs_ranks, agps, work_dir, chunk_size=0, chunks_cs_name='ensembl_internal')

chunk dna sequence fasta no chunking if chunk_size < 50k

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
def chunk_contigs(self, fasta, cs_ranks, agps, work_dir, chunk_size=0, chunks_cs_name="ensembl_internal"):
    """
    chunk dna sequence fasta
      no chunking if chunk_size < 50k
    """
    chunk_size_min_len = self.param_required("sequence_data_chunk_min_len")
    if chunk_size < chunk_size_min_len:
        return fasta, cs_ranks, agps

    # split using script
    os.makedirs(work_dir, exist_ok=True)
    _stderr = f"{work_dir}/chunking.stderr"
    _out_agp = f"{work_dir}/chunks.agp"
    _out_fasta = f"{work_dir}/chunks.fasta"

    _splitter = r"fasta_chunk"
    split_cmd = f"{_splitter} --chunk_size {chunk_size} --agp_out {_out_agp} --out {_out_fasta} --fasta_dna {fasta} 2> {_stderr}"

    print(f"running {split_cmd}", file=sys.stderr)
    # NB throws CalledProcessError if failed
    sp.run(split_cmd, shell=True, check=True)

    # add rank for chunks
    _cs_name, _cs_rank = sorted(cs_ranks.items(), key=lambda k: k[1])[-1]
    cs_ranks[chunks_cs_name] = _cs_rank + 1

    # add agps entry
    if agps is None:
        agps = dict()
    agps[f"{_cs_name}-{chunks_cs_name}"] = _out_agp

    return _out_fasta, cs_ranks, agps

coord_sys_order(cs_order_str)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
880
881
882
def coord_sys_order(self, cs_order_str):
    cs_order_lst = map(lambda x: x.strip(), cs_order_str.split(","))
    return {e: i for i, e in enumerate(filter(lambda x: len(x) > 0, cs_order_lst))}

copy_sr_name_to_syn(cs, x_db, log_pfx)

Store original seq_region names as seq_region_synonym

Store original seq_region names (from a given cood_systen, "cs" param) as seq_region_synonyms (using "x_db" external source name) SQL code.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
def copy_sr_name_to_syn(self, cs, x_db, log_pfx):
    """
    Store original seq_region names as seq_region_synonym

    Store original seq_region names (from a given cood_systen, "cs" param) as seq_region_synonyms (using "x_db" external source name)
    SQL code.
    """
    asm_v = self.asm_name()
    sql = r"""insert into seq_region_synonym (seq_region_id, synonym, external_db_id)
              select
                  sr.seq_region_id, sr.name, xdb.external_db_id
              from
                 seq_region sr, external_db xdb, coord_system cs
              where   xdb.db_name = "%s"
                  and sr.coord_system_id = cs.coord_system_id
                  and cs.name = "%s"
                  and cs.version = "%s"
                  and sr.name like "%%._"
            ;""" % (
        x_db,
        cs,
        asm_v,
    )
    return self.run_sql_req(sql, log_pfx)

db_string()

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1098
1099
1100
1101
1102
1103
1104
1105
def db_string(self):
    return "-dbhost {host_} -dbport {port_} -dbuser {user_} -dbpass {pass_} -dbname {dbname_} ".format(
        host_=self.param("dbsrv_host"),
        port_=self.param("dbsrv_port"),
        user_=self.param("dbsrv_user"),
        pass_=self.param("dbsrv_pass"),
        dbname_=self.param("db_name"),
    )

filter_already_loaded_regions_from_agp(src_file, dst_file, loaded_regions, new_regions)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
def filter_already_loaded_regions_from_agp(self, src_file, dst_file, loaded_regions, new_regions):
    with open(src_file) as src:
        with open(dst_file, "w") as dst:
            for line in src:
                fields = line.strip().split("\t")
                (
                    asm_id,
                    asm_start,
                    asm_end,
                    asm_part,
                    type_,
                    cmp_id,
                    cmp_start,
                    cmp_end,
                    cmp_strand,
                ) = fields
                if type_ in "NU" or asm_id in loaded_regions:
                    continue
                new_regions.add(asm_id)
                print(line.strip(), file=dst)

flattern_seq_region_item(seq_region, prop_name, path_attrib_id_map, sep='/')

Flattern seq_region[property] and store corresponding [ (json_path, attrib_id, value)... ] (as list of trios).

Only works for simple properties or dicts with no arrays on the path. Basically, implemets tree traversal. Utility function used by the add_sr_attribs method

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
def flattern_seq_region_item(
    self, seq_region: dict, prop_name: str, path_attrib_id_map: dict, sep: str = "/"
) -> list:
    """
    Flattern seq_region[property] and store corresponding [ (json_path, attrib_id, value)... ] (as list of trios).

    Only works for simple properties or dicts with no arrays on the path. Basically, implemets tree traversal.
    Utility function used by the `add_sr_attribs` method
    """
    res = []
    # is there anything to do
    if prop_name not in seq_region:
        return res

    # set up
    value = seq_region[prop_name]
    paths_to_go = [
        (prop_name, value)
    ]  # storing path and the corresponding value, to prevent repetetive traversals
    # iterate
    while paths_to_go:
        (path, value) = paths_to_go.pop()  # get last item
        if isinstance(value, list):
            # perhaps, it's better to raise exception then to continue silently
            continue
        if isinstance(value, dict):
            # if value is a complex object, add its leaves
            for key, val in value.items():
                paths_to_go.append((f"{path}{sep}{key}", val))
            continue
        # if value is simple
        attrib_id = path_attrib_id_map.get(path, None)
        if attrib_id:
            res.append((path, attrib_id, value))
    # return what ever we have
    return res

from_param(param, key, not_throw=False)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1132
1133
1134
1135
1136
1137
1138
1139
def from_param(self, param, key, not_throw=False):
    data = self.param_required(param)
    if key not in data:
        if not_throw:
            return None
        else:
            raise Exception("Missing required %s data: %s" % (param, key))
    return data[key]

get_external_db_mapping()

Get a map from a file for external_dbs to Ensembl dbnames from "external_db_map" module(!) param

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
def get_external_db_mapping(self) -> dict:
    """
    Get a map from a file for external_dbs to Ensembl dbnames from "external_db_map" module(!) param
    """
    external_map_path = self.param("external_db_map")
    db_map = dict()
    if external_map_path is None:
        return db_map

    # Load the map
    with open(external_map_path, "r") as map_file:
        for line in map_file:
            if line.startswith("#"):
                continue
            line = re.sub(r"#.*", "", line)
            if re.match(r"^\s*$", line):
                continue
            (from_name, to_name, *rest) = line.strip().split("\t")
            if len(rest) > 0 and rest[0].upper() != "SEQ_REGION":
                continue
            if to_name == "_IGNORE_":
                continue
            db_map[from_name] = to_name
    return db_map

get_toplevel_from_cs(coord_system_name, work_dir)

Returns list of [ (seq_region_name, seq_region_id, "") ] trios for toplevel seq_regions from coord system with coord_system_name or having "coord_system_tag" attribute with the coord_system_name value

SQL code

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
def get_toplevel_from_cs(self, coord_system_name, work_dir) -> list:
    """
    Returns list of [ (seq_region_name, seq_region_id, "") ] trios for toplevel seq_regions from coord system with `coord_system_name`
      or having  "coord_system_tag" attribute with the `coord_system_name` value

    SQL code
    """
    out_pfx = self.pjc(work_dir, f"toplevel_from_{coord_system_name}")
    sql = f"""SELECT DISTINCT sr.name, sr.seq_region_id
            FROM seq_region sr,
                 seq_region_attrib sra,
                 coord_system cs,
                 attrib_type at
            WHERE sr.seq_region_id = sra.seq_region_id
              AND sr.coord_system_id = cs.coord_system_id
              AND sra.attrib_type_id = at.attrib_type_id
              AND (  ( cs.name = "{coord_system_name}" and at.code = "toplevel" )
                  OR ( at.code = "coord_system_tag" and sra.value = "{coord_system_name}" )
                  )
              ORDER BY sr.seq_region_id;
           """

    res = self.run_sql_req(sql, out_pfx)

    sr_trios = []
    out_file = out_pfx + ".stdout"
    with open(out_file) as sr_file:
        skip_header = True
        for line in sr_file:
            if skip_header:
                skip_header = False
                continue
            (name, sr_id) = line.strip().split("\t")
            sr_trios.append((name, sr_id, ""))

    return sr_trios

id_from_map_or_die(key, map_dict, name_for_panic)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1191
1192
1193
1194
1195
def id_from_map_or_die(self, key: str, map_dict: dict, name_for_panic):
    value = map_dict.get(key, None)
    if value is None:
        raise Exception(f"no such key '{key}' in '{name_for_panic}' map")
    return value

initial_sequence_loading(work_dir)

initial preparation and loading of AGPs and fasta data.

initial preparation and loading of AGPs and fasta data using ensembl-analysis perl scripts

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def initial_sequence_loading(self, work_dir: str):
    """
    initial preparation and loading of AGPs and fasta data.

    initial preparation and loading of AGPs and fasta data using ensembl-analysis perl scripts
    """
    # preprocess FASTA with sequences
    #   rename IUPAC to N symbols using sed
    fasta_clean = self.from_param("manifest_data", "fasta_dna")

    # start coord system ranking and agps processing
    agps = self.from_param("manifest_data", "agp", not_throw=True)

    # get the deafult coord_system order
    #   use noagp_cs_name_default for "noagp" assemblies
    cs_order = self.coord_sys_order(self.param("cs_order"))
    noagps_cs = self.param("noagp_cs_name_default")

    # remove gaps and lower_level mappings if the are coveres by higher level ones
    #   i.e.: remove 'contigN to chromosomeZ', if 'contigN to scaffoldM' and 'scaffoldM to chromosomeZ' are in place
    #   returns None if no agps provided
    agps_pruned_dir = self.pjc(work_dir, "agps_pruned")
    agps_pruned = self.prune_agps(agps, cs_order, agps_pruned_dir, self.param_bool("prune_agp"))

    # order
    # rank cs_names, met in agps.keys ("-" separated, i.e. "scaffold-contig") based on cs_order
    cs_rank = self.used_cs_ranks(agps_pruned, cs_order, noagps_cs)

    # chunk sequence data if needed
    #   no chunking if chunk_size < 50k
    chunk_size = int(self.param("sequence_data_chunk"))
    chunk_cs_name = self.param("chunk_cs_name")
    fasta_clean, cs_rank, agps_pruned = self.chunk_contigs(
        fasta_clean,
        cs_rank,
        agps_pruned,
        pj(work_dir, "chunking"),
        chunk_size=chunk_size,
        chunks_cs_name=chunk_cs_name,
    )

    # empty agps_pruned ignored
    self.load_seq_data(fasta_clean, agps_pruned, cs_rank, self.pjc(work_dir, "load"))

    # mark all the "contig"s or noagp_cs as being sourced from ENA
    if not self.param_bool("no_contig_ena_attrib"):
        # NB using original "agps" parameter (with no chunking data added)
        agps_raw = self.from_param("manifest_data", "agp", not_throw=True)
        if agps_raw is None:
            self.add_contig_ena_attrib(self.pjc(work_dir, "load", "set_ena"), cs_name=noagps_cs)
        else:
            self.add_contig_ena_attrib(self.pjc(work_dir, "load", "set_ena"))

    # unversion scaffold, remove ".\d$" from names if there's a need
    if self.param_bool("unversion_scaffolds"):
        self.unversion_scaffolds(cs_rank, self.pjc(work_dir, "unversion_scaffolds"))

    # add assembly mappings between various cs to meta table for the mapper to work properly
    cs_pairs = agps_pruned and agps_pruned.keys() or None
    self.add_asm_mappings(cs_pairs, self.pjc(work_dir, "asm_mappings"))

    # set toplevel seq_region attribute
    self.set_toplevel(self.pjc(work_dir, "set_toplevel"), self.param("not_toplevel_cs"))

    # nullify contig version and update mappings strings accordingly; ignore for "load_additional_sequences" mode
    if not self.param_bool("load_additional_sequences"):
        self.nullify_ctg_cs_version(cs_order, self.pjc(work_dir, "asm_mapping", "nullify_cs_versions"))

insert_to_db(list_of_tuples, table_name, col_names, work_dir, ignore=True)

Insert into the core db's {table_name} tuples from {list_of_tuples} as col_names.

Use quote_or_null (see definition below) method for string values, when putting values into list_of_tuples SQL code

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
def insert_to_db(
    self, list_of_tuples: list, table_name: str, col_names: list, work_dir: str, ignore: bool = True
):
    """
    Insert into the core db's {table_name} tuples from {list_of_tuples} as col_names.

    Use `quote_or_null` (see definition below) method for string values, when putting values into `list_of_tuples`
    SQL code
    """
    # return if nothing to do
    if not list_of_tuples:
        return

    # prepare request parts
    ignore_str = ignore and "IGNORE" or ""
    cols_str = ", ".join(col_names)

    # generate file with the insert SQL command
    insert_sql_file = self.pjc(work_dir, "insert.sql")
    with open(insert_sql_file, "w") as sql:
        print(f"INSERT {ignore_str} INTO {table_name} ({cols_str}) VALUES", file=sql)
        values_sep = ""
        for tpl in list_of_tuples:
            tpl_str = ", ".join(map(str, tpl))
            print(f"{values_sep}({tpl_str})", file=sql)
            values_sep = ", "
        print(";", file=sql)

    # run insert SQL from file
    self.run_sql_req(insert_sql_file, self.pjc(work_dir, "insert"), from_file=True)

is_gz(filename)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1122
1123
def is_gz(self, filename):
    return filename.endswith(".gz")

load_agp(pair, asm_v, src_file, log_pfx)

ensembl script (load_agp.pl) based utility for loading seq_regions assembly data (AGPs)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
def load_agp(self, pair, asm_v, src_file, log_pfx):
    """ensembl script (load_agp.pl) based utility for loading seq_regions assembly data (AGPs)"""
    en_root = self.param_required("ensembl_root_dir")
    (asm_n, cmp_n) = pair.strip().split("-")
    cmd = (
        r"""{_loader} {_db_string} -assembled_version {_asm_v} """
        + r"""    -assembled_name {_asm} -component_name {_cmp} """
        + r"""    -agp_file {_file} """
        + r"""    > {_log}.stdout 2> {_log}.stderr"""
    ).format(
        _loader="perl %s" % (self.pjc(en_root, r"ensembl-analysis/scripts/assembly_loading/load_agp.pl")),
        _db_string=self.db_string(),
        _asm_v=asm_v,
        _asm=asm_n,
        _cmp=cmp_n,
        _file=src_file,
        _log="%s_agp_%s" % (log_pfx, pair.replace("-", "_")),
    )
    print("running %s" % (cmd), file=sys.stderr)
    return sp.run(cmd, shell=True, check=True)

load_cs_data(cs, rank, pair, asm_v, src_file, log_pfx, loaded_regions=None, seq_level=False)

creates a coord_system and loads sequence or assembly(AGP) data for corresponding seqregions

doesn't load already seen sequences

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
def load_cs_data(self, cs, rank, pair, asm_v, src_file, log_pfx, loaded_regions=None, seq_level=False):
    """creates a coord_system and loads sequence or assembly(AGP) data for corresponding seqregions

    doesn't load already seen sequences
    """
    # NB load_seq_region.pl and load_agp.pl are not failing on parameter errors (0 exit code)
    os.makedirs(dirname(log_pfx), exist_ok=True)
    additional_load = self.param_bool("load_additional_sequences")
    if seq_level:
        self.load_seq_region(cs, rank, asm_v, src_file, log_pfx, seq_level, additional_load)
    elif loaded_regions is not None:
        new_regions = set()
        clean_file = src_file + ".regions_deduped"
        self.filter_already_loaded_regions_from_agp(src_file, clean_file, loaded_regions, new_regions)
        self.load_seq_region(cs, rank, asm_v, clean_file, log_pfx, seq_level, additional_load)
        loaded_regions.update(new_regions)
    if not seq_level:
        self.load_agp(pair, asm_v, src_file, log_pfx)

load_map_from_core_db(table, cols, work_dir)

Load 2 "cols" from core db "table" as map

Load { cols[0] : cols[1] } map from the core db "table" SQL code

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
def load_map_from_core_db(self, table, cols, work_dir) -> dict:
    """
    Load 2 "cols" from core db "table" as map

    Load { cols[0] : cols[1] } map from the core db "table"
    SQL code
    """
    out_pfx = self.pjc(work_dir, f"{table}_map")
    sql = f"""select {cols[0]}, {cols[1]} FROM {table};"""
    res = self.run_sql_req(sql, out_pfx)

    out_file = out_pfx + ".stdout"
    data = self.load_map_from_sql_stdout(out_file, skip_header=True)
    if not data:
        raise Exception(f"No '{table}' map loaded from '{out_file}'")
    return data

load_map_from_sql_stdout(in_file, skip_header=False)

Load map from the SQL output

Process input in_file with "key value" pairs and load then into the {key : value} map. Skips header if skip_header.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
def load_map_from_sql_stdout(self, in_file, skip_header=False):
    """
    Load map from the SQL output

    Process input in_file with "key  value" pairs and load then
    into the {key : value} map.
    Skips header if skip_header.
    """
    data = dict()
    with open(in_file) as pairs_file:
        for line in pairs_file:
            if skip_header:
                skip_header = False
                continue
            (key, val) = line.strip().split("\t")
            data[key] = val
    return data

load_seq_data(fasta, agps, cs_rank, log_pfx)

loads sequence data for various coordinate systems accordingly with their rank

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
def load_seq_data(self, fasta, agps, cs_rank, log_pfx):
    """loads sequence data for various coordinate systems accordingly with their rank"""
    asm_v = self.asm_name()

    sequence_rank = max(cs_rank.values())
    for cs, rank in sorted(cs_rank.items(), key=lambda p: -p[1]):
        logs = self.pjc(log_pfx, "%02d_%s" % (rank, cs))
        if rank == sequence_rank:
            self.load_cs_data(cs, rank, "fasta", asm_v, fasta, logs, loaded_regions=None, seq_level=True)
        else:
            useful_agps = list(filter(lambda x: cs in x, agps and agps.keys() or []))
            if len(useful_agps) == 0:
                raise Exception("non-seq_level cs %s has no agps to assemble it from" % (cs))
            loaded_regions = set()
            for pair, agp_file_pruned in map(lambda k: (k, agps[k]), useful_agps):
                if not pair.startswith(cs + "-"):
                    continue
                self.load_cs_data(cs, rank, pair, asm_v, agp_file_pruned, logs, loaded_regions)

load_seq_region(cs, rank, asm_v, src_file, log_pfx, seq_level=False, additional_load=False)

ensembl-analysis script (load_seq_region.pl) based utility for loading seq_regions FASTA sequences

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
def load_seq_region(
    self,
    cs: str,
    rank: str,
    asm_v: str,
    src_file: str,
    log_pfx: str,
    seq_level=False,
    additional_load=False,
):
    """ensembl-analysis script (load_seq_region.pl) based utility for loading seq_regions FASTA sequences"""
    en_root = self.param_required("ensembl_root_dir")
    cmd = (
        r"""{_loader} {_db_string} {_asm_v_flag} -default_version -ignore_ambiguous_bases """
        + r"""    -rank {_rank} -coord_system_name {_cs} {_sl_flag} -{_tag}_file {_file}"""
        + r"""     > {_log}.stdout 2> {_log}.stderr"""
    ).format(
        _loader="perl %s"
        % (self.pjc(en_root, r"ensembl-analysis/scripts/assembly_loading/load_seq_region.pl")),
        _db_string=self.db_string(),
        _asm_v_flag=not additional_load and f"-coord_system_version {asm_v}" or "",
        _rank=rank,
        _cs=cs,
        _sl_flag=seq_level and "-sequence_level" or "",
        _tag=seq_level and "fasta" or "agp",
        _file=src_file,
        _log="%s_seq" % (log_pfx),
    )
    print("running %s" % (cmd), file=sys.stderr)
    return sp.run(cmd, shell=True, check=True)

load_seq_region_synonyms_trios_from_core_db(work_dir)

Load seq_region_synonyms from from core db into [(seq_region_id, name, synonym)...] list

SQL code

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
def load_seq_region_synonyms_trios_from_core_db(self, work_dir: str) -> list:
    # was get_db_syns
    """
    Load seq_region_synonyms from from core db into [(seq_region_id, name, synonym)...] list

    SQL code
    """
    out_pfx = self.pjc(work_dir, f"seq_region_synonyms")
    sql = r"""select sr.seq_region_id as seq_region_id, sr.name, srs.synonym
             from seq_region sr left join seq_region_synonym srs
             on sr.seq_region_id = srs.seq_region_id
             order by sr.seq_region_id
          ;"""

    res = self.run_sql_req(sql, out_pfx)

    syn_trios = []
    out_file = out_pfx + ".stdout"
    with open(out_file) as syns_file:
        skip_header = True
        for line in syns_file:
            if skip_header:
                skip_header = False
                continue
            (sr_id, name, syn) = line.strip().split("\t")
            syn_trios.append((sr_id, name, syn))
    return syn_trios

name_and_id_from_seq_region_item(seq_region_item, seq_region_map, try_unversion=False, throw_missing=True)

Get (seq_region_name, seq_region_id, unversioned_name) from seq_region_item struct(dict)

Gets unversioned_name only if "try_unversion" is True. Throws exception if not able to get seq_region_id from "seq_region_map" and "throw_missing" is true.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
def name_and_id_from_seq_region_item(
    self,
    seq_region_item: dict,
    seq_region_map: dict,
    try_unversion: bool = False,
    throw_missing: bool = True,
) -> (str, str, str):
    """
    Get (seq_region_name, seq_region_id, unversioned_name) from seq_region_item struct(dict)

    Gets unversioned_name only if "try_unversion" is True.
    Throws exception if not able to get seq_region_id from "seq_region_map" and "throw_missing" is true.
    """
    #   get seq_region_id (perhaps, by using unversioned name)
    seq_region_name = seq_region_item["name"]
    seq_region_id = seq_region_map.get(seq_region_name, None)
    unversioned_name = None
    if seq_region_id is None and try_unversion:
        # try to get seq_region_id for the unversioned name
        unversioned_name = re.sub(r"\.\d+$", "", seq_region_name)
        seq_region_id = seq_region_map.get(unversioned_name, "")

    # oops, we don't know such seq_region name
    if not seq_region_id and throw_missing:
        raise Exception(f"Not able to find seq_region for '{seq_region_name}'")

    return (seq_region_name, seq_region_id, unversioned_name)

nullify_ctg_cs_version(cs_order, log_pfx)

Nullify every CS version with rank larger than that of "contig", but don't nullify toplevel ones.

SQL code

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
def nullify_ctg_cs_version(self, cs_order, log_pfx: str):
    """
    Nullify every CS version with rank larger than that of "contig", but don't nullify toplevel ones.

    SQL code
    """
    asm_v = self.asm_name()
    # get cs_info (and if they have toplevel regions)
    sql = r"""select cs.coord_system_id as coord_system_id,
                     cs.name, cs.rank, (tl.coord_system_id is NULL) as no_toplevel
                from coord_system cs
                  left join (
                    select distinct sr.coord_system_id
                      from seq_region sr, seq_region_attrib sra, attrib_type at
                      where at.code = "toplevel"
                        and sra.attrib_type_id = at.attrib_type_id
                        and sra.value = 1
                        and sra.seq_region_id = sr.seq_region_id
                  ) as tl on tl.coord_system_id = cs.coord_system_id
                where cs.version = "{_asm_v}"
                order by rank
          ;""".format(
        _asm_v=asm_v
    )
    # run_sql
    toplvl_pfx = self.pjc(log_pfx, "toplvl_info")
    self.run_sql_req(sql, toplvl_pfx)
    # load info
    cs_info = []
    with open(toplvl_pfx + ".stdout") as f:
        header = None
        for line in f:
            if header is None:
                header = line.strip().split("\t")
                continue
            cs_info.append(dict(zip(header, line.strip().split())))
    # return if there's no coord_systems to nullify versions for
    if not cs_info:
        return
    # get list of known cs from cs_order to clean version from
    nullify_cs_version_from = self.param("nullify_cs_version_from")
    if not nullify_cs_version_from or nullify_cs_version_from not in cs_order:
        return
    cs_thr_index = cs_order[nullify_cs_version_from]
    cs_names_to_keep_ver = frozenset([nm for (nm, ind) in cs_order.items() if ind > cs_thr_index])

    # choose cs rank threshold to start clearing version from
    clear_lst = [
        (cs["coord_system_id"], cs["name"])
        for cs in cs_info
        if (bool(int(cs["no_toplevel"])) and cs["name"] not in cs_names_to_keep_ver)
    ]

    # run sql
    if clear_lst:
        clear_pfx = self.pjc(log_pfx, "clear")
        with open(clear_pfx + ".sql", "w") as clear_sql:
            for cs_id, cs_name in clear_lst:
                sql = r"""
                    update meta set
                        meta_value=replace(meta_value, "|{_cs_name}:{_asm_v}", "|{_cs_name}")
                        where meta_key="assembly.mapping";
                    update coord_system set version = NULL where coord_system_id = {_cs_id};
                """.format(
                    _asm_v=asm_v, _cs_name=cs_name, _cs_id=cs_id
                )
                print(sql, file=clear_sql)
        self.run_sql_req(clear_pfx + ".sql", clear_pfx, from_file=True)

order_agp_levels(agps, cs_order)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
def order_agp_levels(self, agps, cs_order):
    # sort agp for loading by:
    #   highest component (cmp) cs level (lowest rank)
    #   lowest difference between cs ranks (asm - cmp)
    #   i.e: chromosome-scaffold scaffold-chunk chromosome-chunk
    if not agps:
        return []

    agp_cs_pairs = list(map(lambda x: [x] + x.split("-"), agps.keys()))
    agp_levels = [(x[0], cs_order[x[1]], cs_order[x[2]]) for x in agp_cs_pairs]

    bad_agps = list(filter(lambda x: x[1] < x[2], agp_levels))
    if len(bad_agps) > 0:
        raise Exception("component cs has higher order than assembled cs %s" % (str(bad_agps)))

    agp_levels_sorted = [e[0] for e in sorted(agp_levels, key=lambda x: (-x[2], x[1] - x[2]))]
    return agp_levels_sorted

param_bool(param)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1141
1142
1143
def param_bool(self, param):
    val = self.param(param)
    return bool(val) and "0" != val

param_defaults()

default parameter/options values

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def param_defaults(self):
    """
    default parameter/options values
    """
    return {
        # relative order of the coord_systems types to infer their rank from
        "cs_order": "ensembl_internal,chunk,contig,supercontig,non_ref_scaffold,scaffold,primary_assembly,superscaffold,linkage_group,chromosome",
        "artificial_cs": "ensembl_internal,chunk",  # coord_systems to ignore when adding synonyms for sequence_level cs
        "IUPAC": "RYKMSWBDHV",  # symbols to be replaced with N in the DNA sequences (ensembl core(107) doesn't support the whole IUPAC alphabet for DNA)
        # unversion scaffold, remove ".\d$" from seq_region.names if there's a need
        "unversion_scaffolds": 0,
        "versioned_sr_syn_src": "INSDC",  # INSDC(50710) # if unversioning non-sequence level cs, store original name (with version) as this synonym
        "sr_syn_src": "BRC4_Community_Symbol",  # BRC4_Community_Symbol(211) # if unversioning sequence-level cs, store original name (with version) as this synonym
        # nullify coord_system version for the given coord_system name
        "nullify_cs_version_from": "contig",
        # default coord_system name for single-level (no AGPs assemblies)
        "noagp_cs_name_default": "primary_assembly",
        # file for additional mapping of the synonym sources to the external_db (ensembl), as used by "get_external_db_mapping" function below
        "external_db_map": None,
        # set "coord_system_tag" seq_region attribute to this value if there's a corresponding "chromosome_display_order" list in genome.json metadata
        #   if None, only "chromosome" coord system is processed (if present)
        #   (see add_chr_karyotype_rank definition below )
        #   if seq_region already has `coord_system_tagi` attribute, it value updated only if "force_update_coord_system_tag" module param is True (see below)
        "cs_tag_for_ordered": None,
        # Force updating of the "coord_system_tags" attribute for seq_regions from `cs_tag_for_ordered` (see above)
        "force_update_coord_system_tag": False,
        # BRC4 compatibility mode; if on, "(EBI|BRC4)_seq_region_name" seq_region_attributes are added.
        #   Blocked by the "swap_gcf_gca" option. In this case insertion should be done on later pipeline stage after seq_region name swapping.
        "brc4_mode": True,
        # Whether to use RefSeq names as additional seq_region synonyms (if available) or not (see add_sr_synonyms definition below)
        #  Does not swap anything actually, just loads synonyms to be used by a later "swapping" stage
        #  Disables BRC4 compatibilty mode (see the "brc4_mode" option comment).
        "swap_gcf_gca": False,
        # list of coord systems used in "-ignore_coord_system" options of the "ensembl-analysis/scripts/assembly_loading/set_toplevel.pl" script
        #   part of the loading process
        "not_toplevel_cs": [],  # i.e. "contig", "non_ref_scaffold"
        # explicit list of seq_region properties (keys) to load as seq_region_attrib s (values) (see add_sr_attribs definition below)
        #   if a dict's used as a value, treat its keys as "json_path" (/ as delim) map, i.e.
        #       { "added_sequence" : { "assembly_provider" : { "name" : ... } } } -> "added_sequence/assembly_provider/name"
        #   only flattable properties can be used, no arrays
        #   arrays should be processed separately (see `add_sr_synonyms` or `add_karyotype_bands` definitions)
        # see src/python/ensembl/io/genomio/data/schemas/seq_region.json
        "sr_attrib_types": {
            "circular": "circular_seq",
            "codon_table": "codon_table",
            "location": "sequence_location",
            "non_ref": "non_ref",
            "coord_system_level": "coord_system_tag",
            "added_sequence": {
                # json_path to attrib_type_code(str) mapping
                "added_sequence/accession": "added_seq_accession",
                "added_sequence/assembly_provider/name": "added_seq_asm_pr_nam",
                "added_sequence/assembly_provider/url": "added_seq_asm_pr_url",
                "added_sequence/annotation_provider/name": "added_seq_ann_pr_nam",
                "added_sequence/annotation_provider/url": "added_seq_ann_pr_url",
            },  # added_sequence
        },
        # loading additional sequences to the already exsisting core db
        "load_additional_sequences": 0,
        # size of the sequence data chunk, if 0 (default), no chunking is performed
        "sequence_data_chunk": 0,
        #   min size of the sequence chunk, no chunking is done if 'sequence_data_chunk' < 'sequence_data_chunk_min'
        "sequence_data_chunk_min_len": 50_000,
        # coord system name for chunks
        "chunk_cs_name": "ensembl_internal",
    }

pjc(*parts)

Join path parts and try to create every directory but the last one.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
def pjc(self, *parts: list) -> str:
    """
    Join path parts and try to create every directory but the last one.
    """
    if not parts:
        return None

    parts = list(parts)
    last = parts.pop()
    prefix = pj(*parts)

    os.makedirs(prefix, exist_ok=True)

    return pj(prefix, last)

prune_agps(agps, cs_order, agps_pruned_dir, pruning=True)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
def prune_agps(self, agps, cs_order, agps_pruned_dir, pruning=True):
    # when loading agp sort by:
    #   highest component (cmp) cs level (lowest rank)
    #   lowest difference between cs ranks (asm - cmp)
    #   i.e: chromosome-scaffold scaffold-chunk chromosome-chunk
    #   if no agps return empty pruned result

    if not agps:
        return None

    agp_levels_sorted = self.order_agp_levels(agps, cs_order)

    # prune agps
    agps_pruned = dict()
    used_components = set()
    if not pruning:
        used_components = None
    for asm_cmp in agp_levels_sorted:
        agp_file_src = agps[asm_cmp]
        agp_file_dst = self.pjc(agps_pruned_dir, asm_cmp + ".agp")
        if self.agp_prune(agp_file_src, agp_file_dst, used_components) > 0:
            agps_pruned[asm_cmp] = agp_file_dst
    return agps_pruned

quote_or_null(val, quotes="'", null='NULL', strings_only=True)

Return val wrapped in quotes or null value

Quotes only strings (instances of str) if strings_only is True.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
def quote_or_null(self, val: str, quotes: str = "'", null: str = "NULL", strings_only=True) -> str:
    """
    Return `val` wrapped in `quotes` or `null` value

    Quotes only strings (instances of `str`) if strings_only is True.
    """
    if val is None:
        return null
    if strings_only and isinstance(val, str):
        return f"{quotes}{val}{quotes}"
    return val

remove_IUPAC(from_file, to_file)

remove non-valid symbols from FASTA file (using sed) ans store the result in a different location

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
def remove_IUPAC(self, from_file: str, to_file: str):
    """remove non-valid symbols from FASTA file (using sed) ans store the result in a different location"""
    IUPAC = self.param("IUPAC")
    os.makedirs(dirname(to_file), exist_ok=True)
    cmd = r"""{_cat} {_file} | sed -r '/^[^>]/ {{ s/[{_IUPAC}]/N/g; s/{_iupac}/n/g }}' > {_out}""".format(
        _cat=self.is_gz(from_file) and "zcat" or "cat",
        _file=from_file,
        _IUPAC=IUPAC.upper(),
        _iupac=IUPAC.lower(),
        _out=to_file,
    )
    print("running %s" % (cmd), file=sys.stderr)
    return sp.run(cmd, shell=True, check=True)

remove_components_from_toplevel(log_pfx)

Remove toplevel attribute for seq_regions that are "components" (parts of different seq_regions).

SQL code.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
def remove_components_from_toplevel(self, log_pfx):
    """
    Remove toplevel attribute for seq_regions that are "components" (parts of different seq_regions).

    SQL code.
    """

    # get list of seq_regions that are components
    sql_not_toplevel_list = r"""
      select distinct a.cmp_seq_region_id, sr_c.name, cs_c.name
        from assembly a,
             seq_region sr_c, seq_region sr_a,
             coord_system cs_c, coord_system cs_a
        where a.cmp_seq_region_id = sr_c.seq_region_id
          and a.asm_seq_region_id = sr_a.seq_region_id
          and sr_c.coord_system_id = cs_c.coord_system_id
          and sr_a.coord_system_id = cs_a.coord_system_id
          and cs_c.attrib like "%default_version%"
          and cs_a.attrib like "%default_version%"
          and sr_c.seq_region_id in (
            select distinct seq_region_id
              from seq_region_attrib
              where attrib_type_id = 6 and value = 1
          )
    """
    # perhaps, make sense to check sr_c.coord_system_id != sr_a.coord_system_id
    self.run_sql_req(sql_not_toplevel_list, ".".join([log_pfx, "not_toplevel_list"]))

    # delete wrongly assigned attribs
    sql_not_toplevel_delete = r"""
      delete from seq_region_attrib
        where attrib_type_id = 6 and value = 1
          and seq_region_id in (
            select distinct a.cmp_seq_region_id
              from assembly a,
                   seq_region sr_c, seq_region sr_a,
                   coord_system cs_c, coord_system cs_a
              where a.cmp_seq_region_id = sr_c.seq_region_id
                and a.asm_seq_region_id = sr_a.seq_region_id
                and sr_c.coord_system_id = cs_c.coord_system_id
                and sr_a.coord_system_id = cs_a.coord_system_id
                and cs_c.attrib like "%default_version%"
                and cs_a.attrib like "%default_version%"
          );
    """
    # perhaps, check sr_c.coord_system_id != sr_a.coord_system_id
    self.run_sql_req(sql_not_toplevel_delete, ".".join([log_pfx, "not_toplevel_delete"]))

run()

Entry point for the Ehive module. All processing is done here in this case.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def run(self):
    """
    Entry point for the Ehive module. All processing is done here in this case.
    """
    # params
    work_dir = self.param_required("work_dir")

    # initial sequence loading, using ensembl-analysis scripts
    self.initial_sequence_loading(work_dir)

    # load data from the corresponding core db tables
    external_db_map = self.load_map_from_core_db(
        "external_db", ["db_name", "external_db_id"], work_dir
    )  # for external_db
    attrib_type_map = self.load_map_from_core_db(
        "attrib_type", ["code", "attrib_type_id"], work_dir
    )  # for attrib_type
    seq_region_map = self.load_map_from_core_db(
        "seq_region", ["name", "seq_region_id"], work_dir
    )  # for seq_region

    # update synonyms and seq_region_attribs
    unversion = self.param("unversion_scaffolds")
    is_primary_assembly = self.from_param("manifest_data", "agp", not_throw=True) is None
    seq_region_file = self.from_param("manifest_data", "seq_region", not_throw=True)

    #   add seq_region synonyms
    self.add_sr_synonyms(
        seq_region_file,
        seq_region_map,
        external_db_map,
        self.pjc(work_dir, "seq_region_syns"),
        unversion=unversion,
    )

    #   add seq_region attributes
    self.add_sr_attribs(
        seq_region_file,
        seq_region_map,
        attrib_type_map,
        self.pjc(work_dir, "seq_region_attr"),
        unversion=unversion,
    )

    #   add seq_region EBI and BRC4 name attributes in the "BRC4 mode"
    #     special case of attributes adding with default values derived from seq_region names
    #     do not add if preparing to swap RefSeq and GeneBank ids; in this case attributes to be added at a later stage in pipeline
    #     (easier to insert then to update)
    if self.param("brc4_mode") and not self.param("swap_gcf_gca"):
        self.add_sr_ebi_brc4_names(
            seq_region_file,
            seq_region_map,
            attrib_type_map,
            self.pjc(work_dir, "seq_region_ebi_brc4_name"),
            unversion=unversion,
        )

    # add karyotype related data
    self.add_karyotype_data(
        seq_region_file,
        seq_region_map,
        attrib_type_map,
        self.pjc(work_dir, "karyotype"),
        unversion=unversion,
    )

run_sql_req(sql, log_pfx, from_file=False)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
def run_sql_req(self, sql, log_pfx, from_file=False):
    os.makedirs(dirname(log_pfx), exist_ok=True)

    sql_option = r""" -sql '{_sql}' """.format(_sql=sql)
    if from_file:
        sql_option = r""" < '{_sql}' """.format(_sql=sql)

    cmd = r"""{_dbcmd} -url "{_srv}{_dbname}" {_sql_option} > {_out} 2> {_err}""".format(
        _dbcmd="perl %s/scripts/db_cmd.pl" % os.getenv("EHIVE_ROOT_DIR"),
        _srv=self.param("dbsrv_url"),
        _dbname=self.param("db_name"),
        _sql_option=sql_option,
        _out=log_pfx + ".stdout",
        _err=log_pfx + ".stderr",
    )
    print("running %s" % (cmd), file=sys.stderr)
    return sp.run(cmd, shell=True, check=True)

set_toplevel(log_pfx, ignored_cs=[])

Set toplevel(6) seq_region_attrib using ensembl script.

Uses set_toplevel.pl ensembl script.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
def set_toplevel(self, log_pfx, ignored_cs=[]):
    """
    Set toplevel(6) seq_region_attrib using ensembl script.

    Uses set_toplevel.pl ensembl script.
    """
    # set top_level(6) seq_region_attrib
    os.makedirs(dirname(log_pfx), exist_ok=True)
    en_root = self.param_required("ensembl_root_dir")
    cmd = (
        r"""{_set_tl} {_db_string} {_ignored_cs} """ + r"""     > {_log}.stdout 2> {_log}.stderr"""
    ).format(
        _set_tl="perl %s"
        % (self.pjc(en_root, r"ensembl-analysis/scripts/assembly_loading/set_toplevel.pl")),
        _db_string=self.db_string(),
        _ignored_cs=" ".join(map(lambda x: "-ignore_coord_system %s" % (x), ignored_cs)),
        _log=log_pfx,
    )
    print("running %s" % (cmd), file=sys.stderr)
    sp.run(cmd, shell=True, check=True)

    # remove toplevel attribute for seq_regions that are components
    self.remove_components_from_toplevel(log_pfx)

sr_name_unversion(cs, tbl, fld, log_pfx)

Remove version suffix from the seq_region names

Removes '.\d+$' suffices from the seq_region names SQL code.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
def sr_name_unversion(self, cs, tbl, fld, log_pfx):
    """
    Remove version suffix from the seq_region names

    Removes '\.\d+$' suffices from the seq_region names
    SQL code.
    """
    # select synonym, substr(synonym,  1, locate(".", synonym, length(synonym)-2)-1)
    #     from seq_region_synonym  where synonym like "%._"
    asm_v = self.asm_name()
    sql = r"""update {_tbl} t, seq_region sr, coord_system cs
                set
                  t.{_fld} = substr(t.{_fld},  1, locate(".", t.{_fld}, length(t.{_fld})-2)-1)
                where t.{_fld} like "%._"
                  and t.seq_region_id = sr.seq_region_id
                  and sr.coord_system_id = cs.coord_system_id
                  and cs.name = "{_cs}"
                  and cs.version = "{_asm_v}"
            ;""".format(
        _tbl=tbl, _fld=fld, _cs=cs, _asm_v=asm_v
    )
    return self.run_sql_req(sql, log_pfx)

unversion_scaffolds(cs_rank, logs)

Unversion scaffold, remove ".\d$" from seq_region.names if there's a need

Non-versioned syns for contigs (lower, sequence level), versioned for the rest.

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
def unversion_scaffolds(self, cs_rank, logs):
    """
    Unversion scaffold, remove ".\d$" from seq_region.names if there's a need

    Non-versioned syns for contigs (lower, sequence level), versioned for the rest.
    """
    # coord_systems to ignore when adding synonyms for sequence_level cs
    artificial_cs = frozenset(self.param("artificial_cs").split(","))
    seq_cs, max_rank = max(
        [(c, r) for c, r in cs_rank.items() if c not in artificial_cs], key=lambda k: k[1]
    )
    for cs in cs_rank:
        if cs == seq_cs:
            # for non-sequence level cs, store original name (with version) as "sr_syn_src" synonym
            xdb = self.param("sr_syn_src")
            self.copy_sr_name_to_syn(cs, xdb, self.pjc(logs, "cp2syn", cs))
            self.sr_name_unversion(cs, "seq_region_synonym", "synonym", self.pjc(logs, "unv_srs", cs))
        else:
            # for sequence-level cs, store original name (with version) as "versioned_sr_syn_src" synonym
            xdb = self.param("versioned_sr_syn_src")
            self.copy_sr_name_to_syn(cs, xdb, self.pjc(logs, "cp2syn", cs))
            self.sr_name_unversion(cs, "seq_region", "name", self.pjc(logs, "unv_sr", cs))

update_db_single_group(dict_of_col_to_value, table_name, work_dir, where=None)

Update given table name in db; set col = val for all key/value pairs from dict_of_cols_to_values

If where condition is present its value is used for the "WHERE" SQL clause. Use quote_or_null (see definition below) method for string values, when putting values into list_of_tuples

SQL code

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
def update_db_single_group(
    self, dict_of_col_to_value: dict, table_name: str, work_dir: str, where: str = None
):
    """
    Update given `table` name in db; set `col = val` for all key/value pairs from `dict_of_cols_to_values`

    If `where` condition is present its value is used for the "WHERE" SQL clause.
    Use `quote_or_null` (see definition below) method for string values, when putting values into `list_of_tuples`

    SQL code
    """
    # return if nothing to do
    if not dict_of_col_to_value:
        return

    # prepare request parts
    where_str = where and f"WHERE {where}" or ""
    col_val_str = ", ".join([f"{col} = {val}" for col, val in dict_of_col_to_value.items()])

    # generate file with the insert SQL command
    update_sql_file = self.pjc(work_dir, "update.sql")
    with open(insert_sql_file, "w") as sql:
        print(f"UPDATE {table_name} SET {col_val_str} {where_str};", file=sql)
        values_sep = ""
        for tpl in list_of_tuples:
            tpl_str = ", ".join(map(str, tpl))
            print(f"{values_sep}({tpl_str})", file=sql)
            values_sep = ", "
        print(";", file=sql)

    # run insert SQL from file
    self.run_sql_req(update_sql_file, self.pjc(work_dir, "update"), from_file=True)

used_cs_ranks(agps, cs_order, noagp_default=None)

Source code in src/python/ensembl/brc4/runnable/load_sequence_data.py
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
def used_cs_ranks(self, agps, cs_order, noagp_default=None):
    # rank cs_names, met in agps.keys ("-" separated), i.e. "scaffold-contig"
    #   only agps keys used, values are ignored
    #   use noagp_cs_name_default for "noagp" assemblies
    if agps is None:
        if noagp_default is None:
            raise Exception("NoAGP assembly with no default coordinate system name")
        cs_used_set = frozenset([noagp_default])
    else:
        cs_used_set = frozenset(sum(map(lambda x: x.split("-"), agps.keys()), []))

    cs_unknown = cs_used_set.difference(cs_order.keys())
    if len(cs_unknown) > 0:
        raise Exception("Unknown coordinate system(s) %s" % {str(cs_unknown)})
    return {e: i for i, e in enumerate(sorted(cs_used_set, key=lambda x: -cs_order[x]), start=1)}