diff --git a/.gitignore b/.gitignore index 74a7b99b5c3ded20292a5509436601ca3ee36790..cf2817aecc6ae86c9d0f7f2cab2e8f9d5356996f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,13 +3,19 @@ corpora/cirm/mapped_habitats.txt corpora/cirm/mapped_taxids.txt corpora/cirm/mapped_yeast_habitats.txt corpora/cirm/mapped_yeast_taxa.txt +corpora/cirm/mapped_cfbp_habitats.txt +corpora/cirm/mapped_cfbp_taxa.txt corpora/cirm/taxa.txt corpora/cirm/test-yeast.txt corpora/cirm/test.txt corpora/cirm/yeast_habitats.txt corpora/cirm/yeast_taxa.txt +corpora/cirm/cfbp_habitats.txt +corpora/cirm/cfbp_taxa.txt corpora/cirm/Levures_2021/Florilege_21012021.tsv corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv +corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Pathotype.tsv +corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv corpora/dsmz/habitats.txt corpora/dsmz/mapped_habitats.txt corpora/dsmz/mapped_taxids.txt @@ -69,3 +75,4 @@ yatea-train/ softwares/alvisir-install/ softwares/*.sif softwares/obo-utils +log/ diff --git a/corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Pathotype.xlsx b/corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Pathotype.xlsx new file mode 100755 index 0000000000000000000000000000000000000000..faec4fdd0c3d8602a81c51958085387f25bd4f0f Binary files /dev/null and b/corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Pathotype.xlsx differ diff --git a/corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx b/corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx new file mode 100755 index 0000000000000000000000000000000000000000..f4b1962bd3283627180014102f4c31e1cdd4470f Binary files /dev/null and b/corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx differ diff --git a/docs/3-pipeline.svg b/docs/3-pipeline.svg index 08a1334f7c77de730d559465ae1228b65b8e0092..2a89b15f2cfd166ddf57745d593c2a71bea7195a 100644 --- a/docs/3-pipeline.svg +++ b/docs/3-pipeline.svg @@ -4,115 +4,165 @@ <!-- Generated by graphviz version 2.38.0 (20140413.2041) --> <!-- Title: snakemake_dag Pages: 1 --> -<svg width="647pt" height="260pt" - 
viewBox="0.00 0.00 647.00 260.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> +<svg width="1109pt" height="260pt" + viewBox="0.00 0.00 1109.18 260.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 256)"> <title>snakemake_dag</title> -<polygon fill="white" stroke="none" points="-4,4 -4,-256 643,-256 643,4 -4,4"/> +<polygon fill="white" stroke="none" points="-4,4 -4,-256 1105.18,-256 1105.18,4 -4,4"/> <!-- 0 --> <g id="node1" class="node"><title>0</title> -<path fill="none" stroke="#56d86b" stroke-width="2" d="M319.5,-36C319.5,-36 289.5,-36 289.5,-36 283.5,-36 277.5,-30 277.5,-24 277.5,-24 277.5,-12 277.5,-12 277.5,-6 283.5,-0 289.5,-0 289.5,-0 319.5,-0 319.5,-0 325.5,-0 331.5,-6 331.5,-12 331.5,-12 331.5,-24 331.5,-24 331.5,-30 325.5,-36 319.5,-36"/> -<text text-anchor="middle" x="304.5" y="-15.5" font-family="sans" font-size="10.00">all</text> +<path fill="none" stroke="#68d856" stroke-width="2" d="M475.684,-36C475.684,-36 445.684,-36 445.684,-36 439.684,-36 433.684,-30 433.684,-24 433.684,-24 433.684,-12 433.684,-12 433.684,-6 439.684,-0 445.684,-0 445.684,-0 475.684,-0 475.684,-0 481.684,-0 487.684,-6 487.684,-12 487.684,-12 487.684,-24 487.684,-24 487.684,-30 481.684,-36 475.684,-36"/> +<text text-anchor="middle" x="460.684" y="-15.5" font-family="sans" font-size="10.00">all</text> </g> <!-- 1 --> <g id="node2" class="node"><title>1</title> -<path fill="none" stroke="#70d856" stroke-width="2" d="M268,-108C268,-108 177,-108 177,-108 171,-108 165,-102 165,-96 165,-96 165,-84 165,-84 165,-78 171,-72 177,-72 177,-72 268,-72 268,-72 274,-72 280,-78 280,-84 280,-84 280,-96 280,-96 280,-102 274,-108 268,-108"/> -<text text-anchor="middle" x="222.5" y="-87.5" font-family="sans" font-size="10.00">format_cirm_results</text> +<path fill="none" stroke="#8fd856" stroke-width="2" d="M225.184,-108C225.184,-108 134.184,-108 134.184,-108 
128.184,-108 122.184,-102 122.184,-96 122.184,-96 122.184,-84 122.184,-84 122.184,-78 128.184,-72 134.184,-72 134.184,-72 225.184,-72 225.184,-72 231.184,-72 237.184,-78 237.184,-84 237.184,-84 237.184,-96 237.184,-96 237.184,-102 231.184,-108 225.184,-108"/> +<text text-anchor="middle" x="179.684" y="-87.5" font-family="sans" font-size="10.00">format_cirm_results</text> </g> <!-- 1->0 --> <g id="edge1" class="edge"><title>1->0</title> -<path fill="none" stroke="grey" stroke-width="2" d="M242.77,-71.6966C252.998,-62.9655 265.561,-52.2405 276.678,-42.7503"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="279.13,-45.259 284.463,-36.1043 274.585,-39.935 279.13,-45.259"/> +<path fill="none" stroke="grey" stroke-width="2" d="M237.227,-74.6655C293.024,-60.7659 376.017,-40.0915 423.586,-28.2415"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="424.603,-31.5952 433.46,-25.7817 422.911,-24.8028 424.603,-31.5952"/> </g> <!-- 2 --> <g id="node3" class="node"><title>2</title> -<path fill="none" stroke="#d88556" stroke-width="2" d="M448,-108C448,-108 325,-108 325,-108 319,-108 313,-102 313,-96 313,-96 313,-84 313,-84 313,-78 319,-72 325,-72 325,-72 448,-72 448,-72 454,-72 460,-78 460,-84 460,-84 460,-96 460,-96 460,-102 454,-108 448,-108"/> -<text text-anchor="middle" x="386.5" y="-87.5" font-family="sans" font-size="10.00">format_cirm_yeast_results</text> +<path fill="none" stroke="#d85656" stroke-width="2" d="M522.184,-108C522.184,-108 399.184,-108 399.184,-108 393.184,-108 387.184,-102 387.184,-96 387.184,-96 387.184,-84 387.184,-84 387.184,-78 393.184,-72 399.184,-72 399.184,-72 522.184,-72 522.184,-72 528.184,-72 534.184,-78 534.184,-84 534.184,-84 534.184,-96 534.184,-96 534.184,-102 528.184,-108 522.184,-108"/> +<text text-anchor="middle" x="460.684" y="-87.5" font-family="sans" font-size="10.00">format_cirm_yeast_results</text> </g> <!-- 2->0 --> <g id="edge2" class="edge"><title>2->0</title> -<path fill="none" stroke="grey" stroke-width="2" 
d="M366.23,-71.6966C356.002,-62.9655 343.439,-52.2405 332.322,-42.7503"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="334.415,-39.935 324.537,-36.1043 329.87,-45.259 334.415,-39.935"/> +<path fill="none" stroke="grey" stroke-width="2" d="M460.684,-71.6966C460.684,-63.9827 460.684,-54.7125 460.684,-46.1124"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="464.184,-46.1043 460.684,-36.1043 457.184,-46.1044 464.184,-46.1043"/> </g> <!-- 3 --> <g id="node4" class="node"><title>3</title> -<path fill="none" stroke="#56b1d8" stroke-width="2" d="M137,-180C137,-180 12,-180 12,-180 6,-180 1.42109e-14,-174 1.42109e-14,-168 1.42109e-14,-168 1.42109e-14,-156 1.42109e-14,-156 1.42109e-14,-150 6,-144 12,-144 12,-144 137,-144 137,-144 143,-144 149,-150 149,-156 149,-156 149,-168 149,-168 149,-174 143,-180 137,-180"/> -<text text-anchor="middle" x="74.5" y="-159.5" font-family="sans" font-size="10.00">map_cirm_microorganisms</text> +<path fill="none" stroke="#56a9d8" stroke-width="2" d="M858.184,-108C858.184,-108 741.184,-108 741.184,-108 735.184,-108 729.184,-102 729.184,-96 729.184,-96 729.184,-84 729.184,-84 729.184,-78 735.184,-72 741.184,-72 741.184,-72 858.184,-72 858.184,-72 864.184,-72 870.184,-78 870.184,-84 870.184,-84 870.184,-96 870.184,-96 870.184,-102 864.184,-108 858.184,-108"/> +<text text-anchor="middle" x="799.684" y="-87.5" font-family="sans" font-size="10.00">format_cirm_cfbp_results</text> </g> -<!-- 3->1 --> -<g id="edge3" class="edge"><title>3->1</title> -<path fill="none" stroke="grey" stroke-width="2" d="M110.705,-143.876C130.758,-134.392 155.872,-122.513 177.214,-112.419"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="178.869,-115.508 186.413,-108.068 175.876,-109.18 178.869,-115.508"/> +<!-- 3->0 --> +<g id="edge3" class="edge"><title>3->0</title> +<path fill="none" stroke="grey" stroke-width="2" d="M729.031,-74.4109C658.665,-59.881 553.362,-38.1371 497.91,-26.6867"/> +<polygon fill="grey" stroke="grey" 
stroke-width="2" points="498.333,-23.2003 487.832,-24.6057 496.917,-30.0557 498.333,-23.2003"/> </g> <!-- 4 --> <g id="node5" class="node"><title>4</title> -<path fill="none" stroke="#56d8c9" stroke-width="2" d="M265.5,-180C265.5,-180 179.5,-180 179.5,-180 173.5,-180 167.5,-174 167.5,-168 167.5,-168 167.5,-156 167.5,-156 167.5,-150 173.5,-144 179.5,-144 179.5,-144 265.5,-144 265.5,-144 271.5,-144 277.5,-150 277.5,-156 277.5,-156 277.5,-168 277.5,-168 277.5,-174 271.5,-180 265.5,-180"/> -<text text-anchor="middle" x="222.5" y="-159.5" font-family="sans" font-size="10.00">map_cirm_habitats</text> +<path fill="none" stroke="#56d892" stroke-width="2" d="M164.684,-252C164.684,-252 36.6842,-252 36.6842,-252 30.6842,-252 24.6842,-246 24.6842,-240 24.6842,-240 24.6842,-228 24.6842,-228 24.6842,-222 30.6842,-216 36.6842,-216 36.6842,-216 164.684,-216 164.684,-216 170.684,-216 176.684,-222 176.684,-228 176.684,-228 176.684,-240 176.684,-240 176.684,-246 170.684,-252 164.684,-252"/> +<text text-anchor="middle" x="100.684" y="-231.5" font-family="sans" font-size="10.00">get_cirm_bia_taxa_habitats</text> </g> <!-- 4->1 --> <g id="edge4" class="edge"><title>4->1</title> -<path fill="none" stroke="grey" stroke-width="2" d="M222.5,-143.697C222.5,-135.983 222.5,-126.712 222.5,-118.112"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="226,-118.104 222.5,-108.104 219,-118.104 226,-118.104"/> +<path fill="none" stroke="grey" stroke-width="2" d="M49.0357,-215.965C32.5446,-207.803 16.1342,-196.177 6.68419,-180 -1.38637,-166.185 -2.82928,-156.864 6.68419,-144 30.6711,-111.564 74.1296,-98.3336 111.487,-93.2024"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="112.328,-96.6266 121.84,-91.9606 111.494,-89.6764 112.328,-96.6266"/> </g> <!-- 5 --> <g id="node6" class="node"><title>5</title> -<path fill="none" stroke="#56d89a" stroke-width="2" d="M465.5,-180C465.5,-180 307.5,-180 307.5,-180 301.5,-180 295.5,-174 295.5,-168 295.5,-168 295.5,-156 295.5,-156 295.5,-150 
301.5,-144 307.5,-144 307.5,-144 465.5,-144 465.5,-144 471.5,-144 477.5,-150 477.5,-156 477.5,-156 477.5,-168 477.5,-168 477.5,-174 471.5,-180 465.5,-180"/> -<text text-anchor="middle" x="386.5" y="-159.5" font-family="sans" font-size="10.00">map_cirm_yeast_microorganisms</text> +<path fill="none" stroke="#56d0d8" stroke-width="2" d="M173.684,-180C173.684,-180 27.6842,-180 27.6842,-180 21.6842,-180 15.6842,-174 15.6842,-168 15.6842,-168 15.6842,-156 15.6842,-156 15.6842,-150 21.6842,-144 27.6842,-144 27.6842,-144 173.684,-144 173.684,-144 179.684,-144 185.684,-150 185.684,-156 185.684,-156 185.684,-168 185.684,-168 185.684,-174 179.684,-180 173.684,-180"/> +<text text-anchor="middle" x="100.684" y="-159.5" font-family="sans" font-size="10.00">map_cirm_bia_microorganisms</text> </g> -<!-- 5->2 --> -<g id="edge5" class="edge"><title>5->2</title> -<path fill="none" stroke="grey" stroke-width="2" d="M386.5,-143.697C386.5,-135.983 386.5,-126.712 386.5,-118.112"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="390,-118.104 386.5,-108.104 383,-118.104 390,-118.104"/> +<!-- 4->5 --> +<g id="edge13" class="edge"><title>4->5</title> +<path fill="none" stroke="grey" stroke-width="2" d="M100.684,-215.697C100.684,-207.983 100.684,-198.712 100.684,-190.112"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="104.184,-190.104 100.684,-180.104 97.1843,-190.104 104.184,-190.104"/> </g> <!-- 6 --> <g id="node7" class="node"><title>6</title> -<path fill="none" stroke="#d8b456" stroke-width="2" d="M627,-180C627,-180 508,-180 508,-180 502,-180 496,-174 496,-168 496,-168 496,-156 496,-156 496,-150 502,-144 508,-144 508,-144 627,-144 627,-144 633,-144 639,-150 639,-156 639,-156 639,-168 639,-168 639,-174 633,-180 627,-180"/> -<text text-anchor="middle" x="567.5" y="-159.5" font-family="sans" font-size="10.00">map_cirm_yeast_habitats</text> +<path fill="none" stroke="#56d8b9" stroke-width="2" d="M301.684,-180C301.684,-180 215.684,-180 215.684,-180 209.684,-180 
203.684,-174 203.684,-168 203.684,-168 203.684,-156 203.684,-156 203.684,-150 209.684,-144 215.684,-144 215.684,-144 301.684,-144 301.684,-144 307.684,-144 313.684,-150 313.684,-156 313.684,-156 313.684,-168 313.684,-168 313.684,-174 307.684,-180 301.684,-180"/> +<text text-anchor="middle" x="258.684" y="-159.5" font-family="sans" font-size="10.00">map_cirm_habitats</text> </g> -<!-- 6->2 --> -<g id="edge6" class="edge"><title>6->2</title> -<path fill="none" stroke="grey" stroke-width="2" d="M523.222,-143.876C498.149,-134.179 466.604,-121.98 440.135,-111.743"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="441.223,-108.411 430.634,-108.068 438.698,-114.94 441.223,-108.411"/> +<!-- 4->6 --> +<g id="edge14" class="edge"><title>4->6</title> +<path fill="none" stroke="grey" stroke-width="2" d="M139.335,-215.876C160.935,-206.307 188.035,-194.3 210.951,-184.148"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="212.433,-187.319 220.159,-180.068 209.598,-180.919 212.433,-187.319"/> +</g> +<!-- 5->1 --> +<g id="edge5" class="edge"><title>5->1</title> +<path fill="none" stroke="grey" stroke-width="2" d="M120.212,-143.697C130.066,-134.965 142.17,-124.24 152.88,-114.75"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="155.217,-117.356 160.381,-108.104 150.575,-112.117 155.217,-117.356"/> +</g> +<!-- 6->1 --> +<g id="edge6" class="edge"><title>6->1</title> +<path fill="none" stroke="grey" stroke-width="2" d="M239.156,-143.697C229.302,-134.965 217.198,-124.24 206.488,-114.75"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="208.793,-112.117 198.988,-108.104 204.151,-117.356 208.793,-112.117"/> </g> <!-- 7 --> <g id="node8" class="node"><title>7</title> -<path fill="none" stroke="#5682d8" stroke-width="2" d="M105.5,-252C105.5,-252 43.5,-252 43.5,-252 37.5,-252 31.5,-246 31.5,-240 31.5,-240 31.5,-228 31.5,-228 31.5,-222 37.5,-216 43.5,-216 43.5,-216 105.5,-216 105.5,-216 111.5,-216 117.5,-222 117.5,-228 117.5,-228 117.5,-240 
117.5,-240 117.5,-246 111.5,-252 105.5,-252"/> -<text text-anchor="middle" x="74.5" y="-231.5" font-family="sans" font-size="10.00">get_cirm_taxa</text> +<path fill="none" stroke="#567bd8" stroke-width="2" d="M530.684,-252C530.684,-252 390.684,-252 390.684,-252 384.684,-252 378.684,-246 378.684,-240 378.684,-240 378.684,-228 378.684,-228 378.684,-222 384.684,-216 390.684,-216 390.684,-216 530.684,-216 530.684,-216 536.684,-216 542.684,-222 542.684,-228 542.684,-228 542.684,-240 542.684,-240 542.684,-246 536.684,-252 530.684,-252"/> +<text text-anchor="middle" x="460.684" y="-231.5" font-family="sans" font-size="10.00">get_cirm_yeast_taxa_habitats</text> </g> -<!-- 7->3 --> -<g id="edge7" class="edge"><title>7->3</title> -<path fill="none" stroke="grey" stroke-width="2" d="M74.5,-215.697C74.5,-207.983 74.5,-198.712 74.5,-190.112"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="78.0001,-190.104 74.5,-180.104 71.0001,-190.104 78.0001,-190.104"/> +<!-- 7->2 --> +<g id="edge7" class="edge"><title>7->2</title> +<path fill="none" stroke="grey" stroke-width="2" d="M404.248,-215.96C387.248,-207.895 370.511,-196.33 360.684,-180 352.434,-166.291 352.434,-157.709 360.684,-144 368.746,-130.604 381.457,-120.415 395.148,-112.734"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="396.965,-115.735 404.248,-108.04 393.756,-109.514 396.965,-115.735"/> </g> <!-- 8 --> <g id="node9" class="node"><title>8</title> -<path fill="none" stroke="#d85656" stroke-width="2" d="M260,-252C260,-252 185,-252 185,-252 179,-252 173,-246 173,-240 173,-240 173,-228 173,-228 173,-222 179,-216 185,-216 185,-216 260,-216 260,-216 266,-216 272,-222 272,-228 272,-228 272,-240 272,-240 272,-246 266,-252 260,-252"/> -<text text-anchor="middle" x="222.5" y="-231.5" font-family="sans" font-size="10.00">get_cirm_habitat</text> +<path fill="none" stroke="#56d86b" stroke-width="2" d="M539.684,-180C539.684,-180 381.684,-180 381.684,-180 375.684,-180 369.684,-174 369.684,-168 369.684,-168 
369.684,-156 369.684,-156 369.684,-150 375.684,-144 381.684,-144 381.684,-144 539.684,-144 539.684,-144 545.684,-144 551.684,-150 551.684,-156 551.684,-156 551.684,-168 551.684,-168 551.684,-174 545.684,-180 539.684,-180"/> +<text text-anchor="middle" x="460.684" y="-159.5" font-family="sans" font-size="10.00">map_cirm_yeast_microorganisms</text> </g> -<!-- 8->4 --> -<g id="edge8" class="edge"><title>8->4</title> -<path fill="none" stroke="grey" stroke-width="2" d="M222.5,-215.697C222.5,-207.983 222.5,-198.712 222.5,-190.112"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="226,-190.104 222.5,-180.104 219,-190.104 226,-190.104"/> +<!-- 7->8 --> +<g id="edge15" class="edge"><title>7->8</title> +<path fill="none" stroke="grey" stroke-width="2" d="M460.684,-215.697C460.684,-207.983 460.684,-198.712 460.684,-190.112"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="464.184,-190.104 460.684,-180.104 457.184,-190.104 464.184,-190.104"/> </g> <!-- 9 --> <g id="node10" class="node"><title>9</title> -<path fill="none" stroke="#9fd856" stroke-width="2" d="M434,-252C434,-252 339,-252 339,-252 333,-252 327,-246 327,-240 327,-240 327,-228 327,-228 327,-222 333,-216 339,-216 339,-216 434,-216 434,-216 440,-216 446,-222 446,-228 446,-228 446,-240 446,-240 446,-246 440,-252 434,-252"/> -<text text-anchor="middle" x="386.5" y="-231.5" font-family="sans" font-size="10.00">get_cirm_yeast_taxa</text> +<path fill="none" stroke="#d87d56" stroke-width="2" d="M701.184,-180C701.184,-180 582.184,-180 582.184,-180 576.184,-180 570.184,-174 570.184,-168 570.184,-168 570.184,-156 570.184,-156 570.184,-150 576.184,-144 582.184,-144 582.184,-144 701.184,-144 701.184,-144 707.184,-144 713.184,-150 713.184,-156 713.184,-156 713.184,-168 713.184,-168 713.184,-174 707.184,-180 701.184,-180"/> +<text text-anchor="middle" x="641.684" y="-159.5" font-family="sans" font-size="10.00">map_cirm_yeast_habitats</text> +</g> +<!-- 7->9 --> +<g id="edge16" 
class="edge"><title>7->9</title> +<path fill="none" stroke="grey" stroke-width="2" d="M504.962,-215.876C530.035,-206.179 561.58,-193.98 588.049,-183.743"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="589.486,-186.94 597.55,-180.068 586.961,-180.411 589.486,-186.94"/> </g> -<!-- 9->5 --> -<g id="edge9" class="edge"><title>9->5</title> -<path fill="none" stroke="grey" stroke-width="2" d="M386.5,-215.697C386.5,-207.983 386.5,-198.712 386.5,-190.112"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="390,-190.104 386.5,-180.104 383,-190.104 390,-190.104"/> +<!-- 8->2 --> +<g id="edge8" class="edge"><title>8->2</title> +<path fill="none" stroke="grey" stroke-width="2" d="M460.684,-143.697C460.684,-135.983 460.684,-126.712 460.684,-118.112"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="464.184,-118.104 460.684,-108.104 457.184,-118.104 464.184,-118.104"/> +</g> +<!-- 9->2 --> +<g id="edge9" class="edge"><title>9->2</title> +<path fill="none" stroke="grey" stroke-width="2" d="M597.407,-143.876C572.333,-134.179 540.788,-121.98 514.319,-111.743"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="515.407,-108.411 504.818,-108.068 512.882,-114.94 515.407,-108.411"/> </g> <!-- 10 --> <g id="node11" class="node"><title>10</title> -<path fill="none" stroke="#ced856" stroke-width="2" d="M621,-252C621,-252 514,-252 514,-252 508,-252 502,-246 502,-240 502,-240 502,-228 502,-228 502,-222 508,-216 514,-216 514,-216 621,-216 621,-216 627,-216 633,-222 633,-228 633,-228 633,-240 633,-240 633,-246 627,-252 621,-252"/> -<text text-anchor="middle" x="567.5" y="-231.5" font-family="sans" font-size="10.00">get_cirm_yeast_habitat</text> -</g> -<!-- 10->6 --> -<g id="edge10" class="edge"><title>10->6</title> -<path fill="none" stroke="grey" stroke-width="2" d="M567.5,-215.697C567.5,-207.983 567.5,-198.712 567.5,-190.112"/> -<polygon fill="grey" stroke="grey" stroke-width="2" points="571,-190.104 567.5,-180.104 564,-190.104 
571,-190.104"/> +<path fill="none" stroke="#d8cb56" stroke-width="2" d="M924.684,-252C924.684,-252 790.684,-252 790.684,-252 784.684,-252 778.684,-246 778.684,-240 778.684,-240 778.684,-228 778.684,-228 778.684,-222 784.684,-216 790.684,-216 790.684,-216 924.684,-216 924.684,-216 930.684,-216 936.684,-222 936.684,-228 936.684,-228 936.684,-240 936.684,-240 936.684,-246 930.684,-252 924.684,-252"/> +<text text-anchor="middle" x="857.684" y="-231.5" font-family="sans" font-size="10.00">get_cirm_cfbp_taxa_habitats</text> +</g> +<!-- 10->3 --> +<g id="edge10" class="edge"><title>10->3</title> +<path fill="none" stroke="grey" stroke-width="2" d="M803.665,-215.962C786.91,-207.851 770.327,-196.258 760.684,-180 748.445,-159.365 761.841,-134.189 776.166,-115.887"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="779.021,-117.928 782.725,-108.002 773.639,-113.452 779.021,-117.928"/> +</g> +<!-- 11 --> +<g id="node12" class="node"><title>11</title> +<path fill="none" stroke="#b6d856" stroke-width="2" d="M933.684,-180C933.684,-180 781.684,-180 781.684,-180 775.684,-180 769.684,-174 769.684,-168 769.684,-168 769.684,-156 769.684,-156 769.684,-150 775.684,-144 781.684,-144 781.684,-144 933.684,-144 933.684,-144 939.684,-144 945.684,-150 945.684,-156 945.684,-156 945.684,-168 945.684,-168 945.684,-174 939.684,-180 933.684,-180"/> +<text text-anchor="middle" x="857.684" y="-159.5" font-family="sans" font-size="10.00">map_cirm_cfbp_microorganisms</text> +</g> +<!-- 10->11 --> +<g id="edge17" class="edge"><title>10->11</title> +<path fill="none" stroke="grey" stroke-width="2" d="M857.684,-215.697C857.684,-207.983 857.684,-198.712 857.684,-190.112"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="861.184,-190.104 857.684,-180.104 854.184,-190.104 861.184,-190.104"/> +</g> +<!-- 12 --> +<g id="node13" class="node"><title>12</title> +<path fill="none" stroke="#d8a456" stroke-width="2" d="M1089.18,-180C1089.18,-180 976.184,-180 976.184,-180 970.184,-180 
964.184,-174 964.184,-168 964.184,-168 964.184,-156 964.184,-156 964.184,-150 970.184,-144 976.184,-144 976.184,-144 1089.18,-144 1089.18,-144 1095.18,-144 1101.18,-150 1101.18,-156 1101.18,-156 1101.18,-168 1101.18,-168 1101.18,-174 1095.18,-180 1089.18,-180"/> +<text text-anchor="middle" x="1032.68" y="-159.5" font-family="sans" font-size="10.00">map_cirm_cfbp_habitats</text> +</g> +<!-- 10->12 --> +<g id="edge18" class="edge"><title>10->12</title> +<path fill="none" stroke="grey" stroke-width="2" d="M900.494,-215.876C924.63,-206.222 954.968,-194.087 980.49,-183.878"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="982.029,-187.032 990.013,-180.068 979.429,-180.533 982.029,-187.032"/> +</g> +<!-- 11->3 --> +<g id="edge11" class="edge"><title>11->3</title> +<path fill="none" stroke="grey" stroke-width="2" d="M843.347,-143.697C836.394,-135.305 827.914,-125.07 820.284,-115.861"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="822.932,-113.571 813.856,-108.104 817.541,-118.038 822.932,-113.571"/> +</g> +<!-- 12->3 --> +<g id="edge12" class="edge"><title>12->3</title> +<path fill="none" stroke="grey" stroke-width="2" d="M975.984,-143.966C943.004,-134.057 901.252,-121.514 866.659,-111.121"/> +<polygon fill="grey" stroke="grey" stroke-width="2" points="867.223,-107.636 856.639,-108.111 865.209,-114.34 867.223,-107.636"/> </g> </g> </svg> diff --git a/docs/3-process-cirm-data.md b/docs/3-process-cirm-data.md index b161dcb7b1500fdd082546a0a0406cd7cbafc8cb..38c0554686522b82e06be609d2f04d609c024942 100644 --- a/docs/3-process-cirm-data.md +++ b/docs/3-process-cirm-data.md @@ -1,5 +1,5 @@ ## About -The pipeline extracts microorganisms, habitats from CIRM-BIA and CIRM-Levures data sources. +The pipeline extracts microorganisms, habitats from CIRM-BIA, CIRM-Levures and CIRM CFBP data sources. 
<img align="right" width="460" src="3-pipeline.svg"> <!----> @@ -38,11 +38,13 @@ The pipeline handles the following resources : * inputs * `corpora/cirm/BIA_2021/florilege_export_final_17_02_21.xlsx` * `corpora/cirm/Levures_2021/Florilege_21012021.xlsx` + * `corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx` * `ancillaries/OntoBiotope_BioNLP-OST-2019-Habitat.obo` * `ancillaries/OntoBiotope_BioNLP-OST-2019-Phenotype.obo` ? * outputs - * `corpora/florilege/cirm/cirm-results.txt` + * `corpora/florilege/cirm/cirm-bia-results.txt` * `corpora/florilege/cirm/cirm-yeast-results.txt` + * `corpora/florilege/cirm/cirm-cfbp-results.txt` * programs * `alvisnlp singularity container` * `python env` diff --git a/process_CIRM_corpus.snakefile b/process_CIRM_corpus.snakefile index 4dc3bdf08755701b7034db80b7dcd9efa0c27057..3da26f833a9c8b6513c2dc0191ea88643b934f56 100644 --- a/process_CIRM_corpus.snakefile +++ b/process_CIRM_corpus.snakefile @@ -9,13 +9,14 @@ all ''' rule all: input: - 'corpora/florilege/cirm/cirm-results.txt', - 'corpora/florilege/cirm/cirm-yeast-results.txt' + 'corpora/florilege/cirm/cirm-bia-results.txt', + 'corpora/florilege/cirm/cirm-yeast-results.txt', + 'corpora/florilege/cirm/cirm-cfbp-results.txt' ''' get taxa and habitats (CIRM BIA) ''' -rule get_cirm_taxa: +rule get_cirm_bia_taxa_habitats: input: file='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.xlsx' params: @@ -23,8 +24,8 @@ rule get_cirm_taxa: strain_index='1', habitat_index='15' output: - taxa='corpora/cirm/taxa.txt', - habitats='corpora/cirm/habitats.txt', + taxa='corpora/cirm/bia_taxa.txt', + habitats='corpora/cirm/bia_habitats.txt', tsv='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -34,7 +35,7 @@ rule get_cirm_taxa: ''' get taxa and habitats (CIRM Levures) ''' -rule get_cirm_yeast_taxa: +rule get_cirm_yeast_taxa_habitats: input: file='corpora/cirm/Levures_2021/Florilege_21012021.xlsx' params: @@ -49,14 +50,34 @@ 
rule get_cirm_yeast_taxa: python3.7 softwares/Florilege/scripts/preprocess-cirm-yeast.py --input {input.file} --taxa-index {params.taxa_index} --habitat-index {params.habitat_index} --taxa-outfile {output.taxa} --habitat-outfile {output.habitats} --tsv-outfile {output.tsv} """ +''' +get taxa and habitats (CIRM CFBP) +''' +rule get_cirm_cfbp_taxa_habitats: + input: + file='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx' + params: + taxa_index='1', + strain_index='0', + habitat_index='9,10,14,23' + output: + taxa='corpora/cirm/cfbp_taxa.txt', + habitats='corpora/cirm/cfbp_habitats.txt', + tsv='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv' + conda: 'softwares/envs/python3_pandas_env.yaml' + shell: """ + python3.7 softwares/Florilege/scripts/preprocess-cirm-cfbp.py --input {input.file} --taxa-index {params.taxa_index} --strain-index {params.strain_index} --habitat-index {params.habitat_index} --taxa-outfile {output.taxa} --habitat-outfile {output.habitats} --tsv-outfile {output.tsv} + """ + + ''' map microorganisms ''' -rule map_cirm_microorganisms: +rule map_cirm_bia_microorganisms: input: - taxa='corpora/cirm/taxa.txt' + taxa='corpora/cirm/bia_taxa.txt' output: - mapped_taxaids='corpora/cirm/mapped_taxids.txt' + mapped_taxaids='corpora/cirm/mapped_bia_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', @@ -91,14 +112,36 @@ rule map_cirm_yeast_microorganisms: {params.plan} """ + +''' +map microorganisms (CIRM CFBP) +''' +rule map_cirm_cfbp_microorganisms: + input: + taxa='corpora/cirm/cfbp_taxa.txt' + output: + mapped_taxa='corpora/cirm/mapped_cfbp_taxa.txt' + params: + plan='plans/map_microorganisms.plan', + taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', + taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt' + singularity:config["SINGULARITY_IMG"] + shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ + -alias input {input.taxa} \ 
+ -alias output {output.mapped_taxa} \ + -alias taxid_microorganisms {params.taxid_microorganisms} \ + -alias taxa+id_full {params.taxa_id_full} \ + {params.plan} + """ + ''' map habitats of microorganisms ''' rule map_cirm_habitats: input: - habitats='corpora/cirm/habitats.txt' + habitats='corpora/cirm/bia_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_habitats.txt' + mapped_habitats='corpora/cirm/mapped_bia_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', @@ -137,16 +180,39 @@ rule map_cirm_yeast_habitats: {params.plan} """ +''' +map habitats of microorganisms (CIRM CFBP) +''' +rule map_cirm_cfbp_habitats: + input: + habitats='corpora/cirm/cfbp_habitats.txt' + output: + mapped_habitats='corpora/cirm/mapped_cfbp_habitats.txt' + params: + plan='plans/map_habitats.plan', + onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', + tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', + graylist='ancillaries/graylist_extended.heads', + emptywords='ancillaries/stopwords_EN.ttg' + singularity:config["SINGULARITY_IMG"] + shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ + -alias input {input.habitats} \ + -alias output {output.mapped_habitats} \ + -alias ontobiotope {params.onto} \ + -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ + {params.plan} + """ + ''' format results ''' rule format_cirm_results: input: file='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv', - taxa='corpora/cirm/mapped_taxids.txt', - habitats='corpora/cirm/mapped_habitats.txt' + taxa='corpora/cirm/mapped_bia_taxa.txt', + habitats='corpora/cirm/mapped_bia_habitats.txt' output: - result='corpora/florilege/cirm/cirm-results.txt' + result='corpora/florilege/cirm/cirm-bia-results.txt' params: taxa_index='2', strain_index='1', @@ -170,4 +236,21 @@ rule format_cirm_yeast_results: conda: 
'softwares/envs/obo-utils-env.yaml' shell: 'python softwares/Florilege/scripts/format-cirm-yeast-results.py --cirm {input.file} --taxa {input.taxa} --habitats {input.habitats} --taxa-index {params.taxa_index} --habitat-index {params.habitat_index} > {output.result}' +''' +format results (CIRM CFBP) +''' +rule format_cirm_cfbp_results: + input: + file='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv', + taxa='corpora/cirm/mapped_cfbp_taxa.txt', + habitats='corpora/cirm/mapped_cfbp_habitats.txt' + output: + result='corpora/florilege/cirm/cirm-cfbp-results.txt' + params: + taxa_index='1', + strain_index='0', + habitat_index='9,10,14,23' + conda: 'softwares/envs/obo-utils-env.yaml' + shell: 'python softwares/Florilege/scripts/format-cirm-cfbp-results.py --cirm {input.file} --taxa {input.taxa} --habitats {input.habitats} --taxa-index {params.taxa_index} --strain-index {params.strain_index} --habitat-index {params.habitat_index} > {output.result}' + diff --git a/softwares/Florilege/scripts/format-cirm-cfbp-results.py b/softwares/Florilege/scripts/format-cirm-cfbp-results.py new file mode 100644 index 0000000000000000000000000000000000000000..0364f40085f0e69906ed00a37ed8494835b8cecf --- /dev/null +++ b/softwares/Florilege/scripts/format-cirm-cfbp-results.py @@ -0,0 +1,104 @@ +import re +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('--cirm', action='store', default='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv', help='CIRM file') +parser.add_argument('--taxa', action='store', default='corpora/cirm/mapped_cfbp_taxa.txt', help='mapped taxa file') +parser.add_argument('--habitats', action='store', default='corpora/cirm/mapped_cfbp_habitats.txt', help='mapped habitat file') +parser.add_argument('--taxa-index', action='store', default='1', help='index of taxa column') +parser.add_argument('--strain-index', action='store', default='0', help='index of strain column') +parser.add_argument('--habitat-index', action='store', default='9,10,14,23', 
"""Format CIRM CFBP strain/habitat records as tab-separated result lines.

The script reads three files:

* the tab-delimited CFBP export (``--cirm``), whose first row is a header;
* the mapped taxa file (``--taxa``): ``key<TAB>taxid<TAB>name<TAB>path``;
* the mapped habitat file (``--habitats``):
  ``key<TAB>surface form<TAB>concept id<TAB>concept name<TAB>concept path``.

For every data row whose taxon/strain designation has a taxon mapping and
whose habitat cell has a habitat mapping, one entry per unique
``taxid-concept_id`` pair is accumulated, and each entry is printed on
standard output as eight tab-separated fields.
"""
import argparse
import csv
import re

# A mapped-file record: a key without tabs, one tab, then the payload
# (which must start with a non-blank character and be at least two long).
_MAPPED_LINE = re.compile(r'([^\t]+)\t(\S.+)')
# A space followed by a run of digits/dots at the very end of a taxon name.
_TRAILING_NUMBER = re.compile(r'( [0-9\.]+)$')
_WHITESPACE = re.compile(r'\s+')


def _parse_args(argv=None):
    """Parse the command line (``sys.argv`` when *argv* is None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--cirm', action='store', default='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv', help='CIRM file')
    parser.add_argument('--taxa', action='store', default='corpora/cirm/mapped_cfbp_taxa.txt', help='mapped taxa file')
    parser.add_argument('--habitats', action='store', default='corpora/cirm/mapped_cfbp_habitats.txt', help='mapped habitat file')
    parser.add_argument('--taxa-index', action='store', default='1', help='index of taxa column')
    parser.add_argument('--strain-index', action='store', default='0', help='index of strain column')
    parser.add_argument('--habitat-index', action='store', default='9,10,14,23', help='index(es) of habitat column(s) (comma-delimited if multiple indexes')
    return parser.parse_args(argv)


def _load_habitats(path):
    """Return ``{key: set of payload strings}`` read from the mapped habitat file."""
    habitats = {}
    with open(path, "r") as handle:
        for line in handle:
            match = _MAPPED_LINE.match(line.rstrip())
            if match:
                habitats.setdefault(match.group(1), set()).add(match.group(2))
    return habitats


def _load_taxa(path):
    """Return ``{key: payload string}`` read from the mapped taxa file.

    When a key occurs several times, the last occurrence wins.
    """
    taxa = {}
    with open(path, "r") as handle:
        for line in handle:
            match = _MAPPED_LINE.match(line.rstrip())
            if match:
                taxa[match.group(1)] = match.group(2)
    return taxa


def add_entry(mappings, habitat, concepts, taxid, taxon, name, path):
    """Record one (taxon, habitat) observation for every concept of the habitat.

    Args:
        mappings: dict keyed by ``taxid + "-" + concept_id``; updated in place.
        habitat: habitat surface form found in the CIRM file.
        concepts: iterable of strings, each holding exactly four tab-separated
            fields: surface form, concept id, concept name, concept path.
        taxid: taxon identifier (string).
        taxon: taxon surface form that matched the mapped taxa file.
        name: canonical taxon name.
        path: taxon path.

    Returns:
        The same *mappings* dict, for convenience.

    Raises:
        ValueError: if a concept string does not have exactly four fields.
    """
    # Sorted so that insertion order (and thus output order) does not depend
    # on set iteration order, which varies between interpreter runs.
    for concept in sorted(concepts):
        _surface_form, concept_id, concept_name, concept_path = concept.split('\t')
        key = taxid + "-" + concept_id
        entry = mappings.get(key)
        if entry is None:
            # First observation of this pair: store the descriptive fields once.
            entry = mappings[key] = {
                'habitat': {
                    'concept_id': concept_id,
                    'concept_name': concept_name,
                    'concept_path': concept_path,
                    'surface': set(),
                },
                'taxon': {
                    'taxid': taxid,
                    'canonical_name': name,
                    'path': path,
                    'surface': set(),
                },
            }
        entry['habitat']['surface'].add(habitat)
        entry['taxon']['surface'].add(taxon)
    return mappings


def _name_variants(taxon, strain):
    """Return the candidate lookup keys for a row, most specific first."""
    taxon2 = _TRAILING_NUMBER.sub('', taxon)
    return [
        taxon + " CFBP " + strain, taxon + " CFBP" + strain, taxon + " CFBP:" + strain,
        taxon2 + " CFBP " + strain, taxon2 + " CFBP" + strain, taxon2 + " CFBP:" + strain,
        "CFBP " + strain, "CFBP" + strain, "CFBP:" + strain,
        taxon, taxon2,
    ]


def _collect_mappings(cirm_path, taxa_dict, habitat_dict, taxa_index, strain_index, habitat_indexes):
    """Scan the CIRM file and return the accumulated unique mappings."""
    mappings = {}
    last_needed = max(taxa_index, strain_index)
    # csv.reader (rather than a bare split on tabs) so that quoted cells
    # containing tabs, doubled quotes or line breaks stay in a single field.
    with open(cirm_path, "r", newline="") as handle:
        rows = csv.reader(handle, delimiter="\t")
        next(rows, None)  # skip the header row
        for fields in rows:
            if len(fields) <= last_needed:
                # Blank or truncated row: nothing to look up.
                continue
            taxon = _WHITESPACE.sub(' ', fields[taxa_index]).strip().replace('"', '')
            strain = _WHITESPACE.sub(' ', fields[strain_index]).strip()
            matched = next(
                (variant for variant in _name_variants(taxon, strain) if variant in taxa_dict),
                None,
            )
            if matched is None:
                continue
            taxid, name, path = taxa_dict[matched].split("\t")
            for index in habitat_indexes:
                if index >= len(fields):
                    continue
                # Collapse whitespace, trim, then drop a single trailing period.
                habitat = re.sub(r'\.$', '', _WHITESPACE.sub(' ', fields[index]).strip())
                concepts = habitat_dict.get(habitat)
                if concepts:
                    add_entry(mappings, habitat, concepts, taxid, matched, name, path)
    return mappings


def _format_mapping(mapping):
    """Return the tab-separated output line for one accumulated mapping."""
    return "\t".join((
        '|'.join(sorted(mapping['taxon']['surface'])),
        mapping['taxon']['canonical_name'],
        mapping['taxon']['taxid'],
        mapping['taxon']['path'],
        '|'.join(sorted(mapping['habitat']['surface'])),
        mapping['habitat']['concept_id'],
        mapping['habitat']['concept_name'],
        mapping['habitat']['concept_path'],
    ))


def main(argv=None):
    """Run the formatter; prints one line per unique mapping on stdout."""
    args = _parse_args(argv)
    habitat_indexes = [int(index) for index in args.habitat_index.split(',')]
    unique_mappings = _collect_mappings(
        args.cirm,
        _load_taxa(args.taxa),
        _load_habitats(args.habitats),
        int(args.taxa_index),
        int(args.strain_index),
        habitat_indexes,
    )
    for mapping in unique_mappings.values():
        print(_format_mapping(mapping))


if __name__ == "__main__":
    main()
"""Prepare the CIRM CFBP spreadsheet for taxon and habitat mapping.

Reads the Excel export (``--input``) and writes three files:

* ``--taxa-outfile``: one candidate taxon/strain designation per line;
* ``--habitat-outfile``: one habitat string per line, taken from the
  habitat columns;
* ``--tsv-outfile``: the spreadsheet converted to tab-delimited text.

Taxon, strain and habitat strings are normalised (whitespace collapsed,
ends trimmed) the same way format-cirm-cfbp-results.py normalises cells
before looking them up, so the written keys are the ones that script queries.
"""
import argparse
import re

import pandas as pd

_WHITESPACE = re.compile(r'\s+')


def clean_cell(value):
    """Return a cell as text with whitespace collapsed; '' when missing.

    Cells are read with ``dtype='object'`` and may therefore be numbers or
    dates rather than strings, hence the explicit ``str()`` conversion.
    """
    if pd.isna(value):
        return ''
    return _WHITESPACE.sub(' ', str(value)).strip()


def taxon_variants(taxon, strain):
    """Return the set of designations generated for one taxon/strain pair.

    For a non-empty *taxon* both the name itself and the name without a
    trailing " <digits/dots>" token are used, alone and combined with
    "CFBP <strain>", "CFBP<strain>" and "CFBP:<strain>". Strain-only forms
    are added whenever *strain* is non-empty. Empty inputs contribute nothing.
    """
    names = set()
    bases = []
    if taxon:
        bases = [taxon, re.sub(r'( [0-9\.]+)$', r'', taxon)]
    for base in bases:
        if not base:
            continue
        names.add(base)
        if strain:
            names.update((base + " CFBP " + strain, base + " CFBP" + strain, base + " CFBP:" + strain))
    if strain:
        names.update(("CFBP " + strain, "CFBP" + strain, "CFBP:" + strain))
    return names


def collect_taxa(cirm_data, taxa_index, strain_index):
    """Return the set of taxon designations for every row of *cirm_data*."""
    taxa = set()
    rows = zip(cirm_data.iloc[:, taxa_index], cirm_data.iloc[:, strain_index])
    for raw_taxon, raw_strain in rows:
        # Same order as the formatting script: normalise, then drop quotes.
        taxon = clean_cell(raw_taxon).replace('"', '')
        taxa.update(taxon_variants(taxon, clean_cell(raw_strain)))
    return taxa


def collect_habitats(cirm_data, habitat_indexes):
    """Return the distinct habitat strings of the given columns, in first-seen order.

    Each value is whitespace-normalised and loses a single trailing period;
    values that end up empty are dropped.
    """
    seen = {}
    for index in habitat_indexes:
        for raw in cirm_data.iloc[:, index].dropna().unique():
            habitat = re.sub(r'\.$', r'', clean_cell(raw))
            if habitat:
                seen.setdefault(habitat, None)
    return list(seen)


def main(argv=None):
    """Run the preprocessing (uses ``sys.argv`` when *argv* is None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', action='store', default='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx', help='CIRM file')
    parser.add_argument('--taxa-index', action='store', default='1', help='index of taxa column')
    parser.add_argument('--strain-index', action='store', default='0', help='index of strain column')
    parser.add_argument('--habitat-index', action='store', default='9,10,14,23', help='index(es) of habitat column(s) (comma-delimited if multiple indexes')
    parser.add_argument('--taxa-outfile', action='store', default='corpora/cirm/cfbp_taxa.txt', help='output list of taxa')
    parser.add_argument('--habitat-outfile', action='store', default='corpora/cirm/cfbp_habitats.txt', help='output list of habitats')
    parser.add_argument('--tsv-outfile', action='store', default='corpora/cirm/CFBP_2020/CFPB_22_sept_2020_Type.tsv', help='output tab-delimited converted file')
    args = parser.parse_args(argv)

    taxa_index = int(args.taxa_index)
    strain_index = int(args.strain_index)
    habitat_indexes = [int(index) for index in args.habitat_index.split(',')]

    cirm_data = pd.read_excel(args.input, dtype='object')

    # Sorted so that the file content is identical from one run to the next.
    taxa = sorted(collect_taxa(cirm_data, taxa_index, strain_index))
    with open(args.taxa_outfile, 'w') as f:
        f.write("\n".join(taxa) + "\n")

    with open(args.habitat_outfile, 'w') as f:
        f.writelines(habitat + "\n" for habitat in collect_habitats(cirm_data, habitat_indexes))

    cirm_data.to_csv(args.tsv_outfile, sep="\t", index=False)


if __name__ == "__main__":
    main()