cpr / scripts /embed_fasta.sh
LoocasGoose's picture
new clean branch
3a8e9de
#!/bin/bash
#
# Embed every protein FASTA file in a directory with embed_seqs.py.
#
# Usage: embed_fasta.sh [INPUT_DIR]
#   INPUT_DIR  directory holding the *_prot.fasta files
#              (default: current directory)
#
# For each INPUT_DIR/<name>_prot.fasta, embed_seqs.py is started from the
# protein-vec src_run directory and asked to write
# INPUT_DIR/emb/<name>_emb.npy.
#
# Optional environment overrides (defaults are the historical hard-coded
# values of this script):
#   CUDA_DEVICE          value exported as CUDA_VISIBLE_DEVICES (default: 3)
#   PROTEIN_VEC_SRC_DIR  directory embed_seqs.py is run from
#                        (default: /home/yangk/proteins/protein-vec/src_run)
#
# Exit status: 0 if every matching file was embedded (or none matched),
#              1 if the input directory is unusable or any embedding failed.

set -uo pipefail

CUDA_DEVICE="${CUDA_DEVICE:-3}"
PROTEIN_VEC_SRC_DIR="${PROTEIN_VEC_SRC_DIR:-/home/yangk/proteins/protein-vec/src_run}"

# Print a diagnostic on stderr so it never mixes with progress output.
err() {
  printf 'embed_fasta: %s\n' "$*" >&2
}

#######################################
# Run embed_seqs.py on a single FASTA file.
# Globals:   CUDA_DEVICE, PROTEIN_VEC_SRC_DIR (read)
# Arguments: $1 - absolute path of the input FASTA file
#            $2 - absolute path of the .npy file to write
# Returns:   0 on success; non-zero if PROTEIN_VEC_SRC_DIR cannot be
#            entered or the python command fails.
#######################################
embed_one() {
  local fasta_file=$1
  local output_file=$2

  # The subshell confines the directory change: the caller's working
  # directory (and therefore its remaining loop iterations) is unaffected.
  (
    cd -- "$PROTEIN_VEC_SRC_DIR" || exit 1
    CUDA_VISIBLE_DEVICES=$CUDA_DEVICE python embed_seqs.py \
      --input_file "$fasta_file" --output_file "$output_file"
  )
}

#######################################
# Embed all *_prot.fasta files found directly inside a directory.
# Globals:   none directly (embed_one reads CUDA_DEVICE and
#            PROTEIN_VEC_SRC_DIR)
# Arguments: $1 - input directory (default: current directory)
# Outputs:   one "Processing: ..." line per file and a final summary on
#            stdout; diagnostics on stderr
# Returns:   0 if nothing failed, 1 otherwise
#######################################
embed_directory() {
  local input_dir=${1:-.}
  local emb_dir fasta_file base_name output_file
  local processed=0
  local failed=0

  # embed_seqs.py runs from a different working directory, so every path
  # handed to it must be absolute. CDPATH is cleared so cd prints nothing.
  input_dir=$(CDPATH= cd -- "$input_dir" && pwd) || {
    err "cannot access input directory: ${1:-.}"
    return 1
  }

  emb_dir="$input_dir/emb"
  mkdir -p -- "$emb_dir" || {
    err "cannot create output directory: $emb_dir"
    return 1
  }

  for fasta_file in "$input_dir"/*_prot.fasta; do
    # An unmatched glob is left as the literal pattern; skip it (and
    # anything else that is not a regular file).
    [[ -f "$fasta_file" ]] || continue

    base_name=$(basename -- "$fasta_file" "_prot.fasta")
    output_file="$emb_dir/${base_name}_emb.npy"

    echo "Processing: $fasta_file -> $output_file"
    if embed_one "$fasta_file" "$output_file"; then
      processed=$((processed + 1))
    else
      # Keep going so one bad file does not block the rest, but remember
      # the failure for the final exit status.
      err "embedding failed for $fasta_file"
      failed=$((failed + 1))
    fi
  done

  if ((failed > 0)); then
    err "$failed file(s) failed, $processed embedded in $emb_dir"
    return 1
  fi

  echo "All files processed. Embeddings saved in $emb_dir."
}

embed_directory "$@" || exit 1