##########################################
# generate_abstraqt_dataset.sh
# 
# Edoardo Sarti
# 2026
##########################################

# Generates the training/validation set for the ABSTRAQT scoring function
# The training set is equilibrated and divided into 5 sections:
#  1-2. Positive-structured: CATH domains (taken separately from and jointly with their original PDB structure)
#  3. Positive-disordered: DisProt/AlphaFold consensus intrinsically disordered residues
#  4. Negative-structured: Hallucinated secondary structures (detected by our algorithm)
#  5. Negative-disordered: AlphaFold intrinsically disordered residues that AIUPred classifies as structured


#  Source environment variables and check requirements
##########################################

# Absolute directory of this script, so sibling helpers are found regardless
# of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# locations.sh presumably defines the directory variables used further down
# (CATH_DIR, DB_DIR, AFDB_DIR) -- TODO confirm. Refuse to continue without it:
# with those variables empty, every output path below would resolve under "/".
if [[ ! -r "${SCRIPT_DIR}/locations.sh" ]]; then
  echo "ERROR: cannot read ${SCRIPT_DIR}/locations.sh" >&2
  exit 1
fi
source "${SCRIPT_DIR}/locations.sh"

# Abort when the requirements check reports a failure instead of running the
# whole pipeline against a broken environment.
# NOTE(review): assumes check_requirements.sh exits non-zero only on a real
# failure -- confirm.
if ! bash "${SCRIPT_DIR}/check_requirements.sh"; then
  echo "ERROR: ${SCRIPT_DIR}/check_requirements.sh failed" >&2
  exit 1
fi


#  1-2. Positive-structured (CATH)
##########################################

# Generate CATH training list. Whole CATH domains are randomly selected.
# Output rows: "<id>.pdb<TAB><chain><TAB><residue number><TAB>1".
cath_fasta="${CATH_DIR}/cath-dataset-nonredundant-S20-v4_1_0.atom.fa"

# From every FASTA header (lines starting with ">"), keep characters 1-4 of
# the third "|"-separated field (used below as the PDB file stem) and
# everything from character 9 on (parsed below as "_"-separated start-end
# residue ranges).
awk -F"|" 'substr($0,1,1)==">"{print substr($3,1,4), substr($3,9)}' "${cath_fasta}" \
  > "${DB_DIR}/ranges.tmp.txt"

# For each entry, read the chain identifier (column 22) of the first ATOM
# record of the matching PDB file. Iterating over ranges.tmp.txt line by line
# guarantees chains.tmp.txt has exactly one line per range line, which the
# paste below relies on.
while read -r pdb_id _; do
  chain=$(awk '/^ATOM/{print substr($0,22,1); exit}' "${CATH_DIR}/${pdb_id}.pdb")
  printf '%s %s\n' "${pdb_id}" "${chain}"
done < "${DB_DIR}/ranges.tmp.txt" > "${DB_DIR}/chains.tmp.txt"

paste "${DB_DIR}/chains.tmp.txt" "${DB_DIR}/ranges.tmp.txt" > "${DB_DIR}/table.tmp.txt"

# First awk: turn the "_" separators between ranges into spaces.
# Second awk: fields 4..NF are "start-end" ranges; emit one row per residue.
awk -F"_" '{for (i=1;i<=NF;i++) {printf "%s ", $i}; print ""}' "${DB_DIR}/table.tmp.txt" \
  | awk '{for (i=4; i<=NF; i++) {split($i,a,"-"); for (j=a[1]+0;j<=a[2]+0;j++) {printf "%s.pdb\t%s\t%s\t1\n", $1, $2, j}}}' \
  > "${DB_DIR}/training_list_cath.tsv"

# Clean it from errors in chain: keep only rows whose chain column is exactly
# one character. Plain -F is used because "-OF" is accepted only by gawk
# (as -O plus -F) and is rejected by mawk.
awk -F'\t' 'length($2)==1' "${DB_DIR}/training_list_cath.tsv" \
  > "${DB_DIR}/training_list_cath_clean.tsv"
rm -- "${DB_DIR}/chains.tmp.txt" "${DB_DIR}/ranges.tmp.txt" "${DB_DIR}/table.tmp.txt"


# 3. Positive-disordered (DisProt)
##########################################

# Run the DisProt comparison helper and store its stdout as the
# positive-disordered list. Paths are quoted so that directories containing
# spaces or glob characters are handled correctly.
python3 "${SCRIPT_DIR}/compare-af2rsa-disprot.py" > "${DB_DIR}/training_list_disprot.tsv"


# 4. Negative-structured (Geometric algorithm)
##########################################

# Generate report of implausible SS regions.
# The report is written to report_on_all_human.tsv because that is the file
# the awk step below reads; writing it to training_list_SSreports.tsv would
# only have it overwritten by that same step.
# NOTE(review): assumes report_unphysical_SSEs.py emits the report on stdout
# -- confirm.
python3 "${SCRIPT_DIR}/report_unphysical_SSEs.py" > "${DB_DIR}/report_on_all_human.tsv"

# Transform report_on_all_human.tsv into a negative list: for every report
# row, emit one line per integer from column 3 to column 4 (inclusive) as
# "<AFDB_DIR><col1><TAB><col2><TAB><i><TAB>0".
# AFDB_DIR is concatenated to column 1 with no separator, so it presumably
# needs to end in "/" -- verify against locations.sh.
awk -v afdb_dir="${AFDB_DIR}" '{for (i=$3;i<=$4;i++) {print afdb_dir "" $1 "\t" $2 "\t" i "\t" 0}}' \
  "${DB_DIR}/report_on_all_human.tsv" > "${DB_DIR}/training_list_SSreports.tsv"


# 5. Negative-disordered (AIUPred)
##########################################

# AIUPred: run the unconfirmed-disorder helper and store its stdout as the
# negative-disordered list. Paths are quoted so that directories containing
# spaces or glob characters are handled correctly.
python3 "${SCRIPT_DIR}/find-unconfirmed-disorder.py" \
  > "${DB_DIR}/training_list_unconfirmed_disorder_aiupred.tsv"


# Generate the equilibrated dataset
##########################################

# Hand over to the balancing step. The path is quoted so that a DB_DIR
# containing spaces or glob characters is handled correctly.
# NOTE(review): this helper is looked up in DB_DIR, whereas every other helper
# is run from SCRIPT_DIR -- confirm this location is intended.
bash "${DB_DIR}/generate_equilibrated_dataset.sh"
