RefgetStore

Example: Full GlobalRefgetStore Usage

import os
import tempfile
from gtars.refget import GlobalRefgetStore, StorageMode, digest_fasta, RetrievedSequence

def run_store_example():
    with tempfile.TemporaryDirectory() as temp_dir:
        # 1. Prepare a dummy FASTA file
        fasta_content = (
            ">chr1\n"
            "ATGCATGCATGCAGTCGTAGC\n"
            ">chr2\n"
            "GGGGAAAA\n"
        )
        source_fasta_path = os.path.join(temp_dir, "source.fa")
        with open(source_fasta_path, "w") as f:
            f.write(fasta_content)

        # 2. Digest the FASTA to get collection info and digest
        collection = digest_fasta(source_fasta_path)
        collection_digest = collection.digest
        print(f"Source FASTA digested. Collection digest: {collection_digest}\n")

        # 3. Initialize GlobalRefgetStore in Encoded mode
        store = GlobalRefgetStore(StorageMode.Encoded)
        print(f"Initialized store: {store}\n")

        # 4. Import FASTA into the store
        store.import_fasta(source_fasta_path)
        print("FASTA imported into the store.\n")

        # 5. Get a sequence by its ID (using the digest from the first sequence in collection)
        seq_digest_chr1 = collection[0].metadata.sha512t24u
        record_chr1 = store.get_sequence_by_id(seq_digest_chr1)
        if record_chr1:
            print(f"Retrieved sequence by ID: {record_chr1.metadata.name}, length {record_chr1.metadata.length}")
            # Note: record_chr1.data might be None if store mode is Encoded and data isn't decoded automatically
            print(f"  Sequence (full): {store.get_substring(seq_digest_chr1, 0, record_chr1.metadata.length)}\n")

        # 6. Get a substring
        sub_seq = store.get_substring(seq_digest_chr1, 5, 15)
        print(f"Substring from chr1[5:15]: {sub_seq}\n") # Expected: TGCAGTCGTA

        # 7. Prepare a BED file for region retrieval
        bed_content = (
            "chr1\t0\t10\n"
            "chr2\t2\t6\n"
            "chr_nonexistent\t0\t5\n" # This entry will be skipped
        )
        bed_path = os.path.join(temp_dir, "regions.bed")
        with open(bed_path, "w") as f:
            f.write(bed_content)

        # 8. Retrieve sequences from BED file to a list
        retrieved_list = store.get_seqs_bed_file_to_vec(collection_digest, bed_path)
        print("Retrieved sequences from BED file (as list):")
        for rs in retrieved_list:
            print(f"  - {rs}")
        print("\n")

        # 9. Retrieve sequences from BED file and write to new FASTA
        output_fasta_path = os.path.join(temp_dir, "output_regions.fa")
        store.get_seqs_bed_file(collection_digest, bed_path, output_fasta_path)
        print(f"Retrieved sequences from BED file written to: {output_fasta_path}")
        with open(output_fasta_path, "r") as f:
            print("Content of output FASTA:\n" + f.read())
        print("\n")

        # 10. Write store to a new directory
        saved_store_path = os.path.join(temp_dir, "my_refget_store")
        store.write_store_to_directory(saved_store_path, "{digest_prefix}/{digest}.gz") # Custom template
        print(f"Store saved to: {saved_store_path}\n")

        # 11. Load store from the directory
        loaded_store = GlobalRefgetStore.load_from_directory(saved_store_path)
        print(f"Store successfully loaded from: {saved_store_path}")

if __name__ == "__main__":
    run_store_example()