Webarc:PBS Job Scripts for Indexing
From Adapt
submit.sh
#!/bin/bash
#PBS -N waexp
#PBS -l walltime=48:00:00
#PBS -l nodes=5
#PBS -m be
#PBS -M scsong@gmail.com
#PBS -S /bin/bash
#PBS -V
#
# submit.sh -- PBS job script that deploys runindex.sh for the index numbers
# FIRSTINDEXNO..LASTINDEXNO across the nodes listed in $PBS_NODEFILE, polls
# every node until no runindex.sh process is left, and finally clears
# /scratch1 on every node.
#
# Index numbers are dealt round-robin: the k-th node (0-based) receives
# FIRSTINDEXNO+k, FIRSTINDEXNO+k+totalnodes, ... up to LASTINDEXNO, and all
# of a node's runs are started concurrently in the background.
#
# Outputs:
#   $PROGFILE        progress log (truncated at start)
#   ~/pbs_nodefile   copy of the node list used by this job

HOST=$(hostname -s)
PROGFILE=/vnodehomes/toaster/webarc/submit.prog
FIRSTINDEXNO=58
LASTINDEXNO=83

# Without a node file an unquoted `cat` would silently read stdin and hang,
# so refuse to run outside of a PBS job.
: "${PBS_NODEFILE:?PBS_NODEFILE is not set; submit this script with qsub}"

# Print the current time in the format used by all progress messages.
timestamp() {
  date +"%m/%d/%y %H:%M:%S"
}

# Append one timestamped message to the progress log.
log_progress() {
  echo "$HOST($(timestamp)): $*" >> "$PROGFILE"
}

# Read the node list once; every later loop iterates over the same array.
nodes=()
while IFS= read -r node || [[ -n "$node" ]]; do
  [[ -n "$node" ]] && nodes+=("$node")
done < "$PBS_NODEFILE"

totalnodes=${#nodes[@]}
if (( totalnodes == 0 )); then
  echo "$HOST($(timestamp)): no nodes listed in $PBS_NODEFILE" >&2
  exit 1
fi

#
# Define function stageout
#
# Signal handler: start removing the scratch data on every node. Messages go
# to stdout (the PBS job output), not to $PROGFILE. The handler returns to the
# interrupted script; it does not exit by itself.
stageout() {
  local node now
  now=$(timestamp)
  echo "$HOST($now): SIGTERM caught. Stage out. Copying any existing outputs"
  for node in "${nodes[@]}"; do
    echo "$HOST($now): Staging out $node"
    ssh "$node" "\rm -rf /scratch1/*" &
  done
}

#
# Trap SIGINT and SIGTERM to invoke function stageout
# (SIGKILL cannot be trapped, so it is not listed.)
#
trap 'stageout' INT TERM

echo "" > "$PROGFILE"
cat "$PBS_NODEFILE" > ~/pbs_nodefile

#
# Run runindex.sh at worker nodes
#
count=$FIRSTINDEXNO
for node in "${nodes[@]}"; do
  for (( i = count; i <= LASTINDEXNO; i += totalnodes )); do
    log_progress "$node is being deployed"
    if (( count == FIRSTINDEXNO )); then
      # First entry of the node file is treated as the master node (the one
      # running this script), so its share is started locally.
      ~/webarc/runindex.sh "$i" &
    else
      ssh "$node" "~/webarc/runindex.sh $i" &
    fi
  done
  count=$(( count + 1 ))
done

#
# Monitor deploy status
#
# NOTE(review): a node is considered finished as soon as pidof finds no
# runindex.sh there (or the ssh call itself fails); confirm that this cannot
# trigger before a freshly started remote run becomes visible.
PNAME="runindex.sh"
for node in "${nodes[@]}"; do
  while ssh "$node" /sbin/pidof -x "$PNAME" > /dev/null; do
    log_progress "runindex.sh still running at $node"
    sleep 300 # sleep 5 minutes
  done
  log_progress "no existing runindex.sh"
done

#
# Stage out all data
#
cleanup_pids=()
for node in "${nodes[@]}"; do
  log_progress "stage out data from $node"
  ssh "$node" "\rm -rf /scratch1/*" &
  cleanup_pids+=("$!")
done

# Wait for the removals; otherwise the job ends while they are still running.
wait "${cleanup_pids[@]}"
runindex.sh
#!/usr/bin/bash
#
# runindex.sh -- build the Indri index for one monthly slice on the local
# node and copy the result to the shared index directory.
#
# Usage: runindex.sh INDEXNO
#   INDEXNO  non-negative integer; zero-padded to three digits ($d) and used
#            in every per-month file and directory name.
#
# Steps:
#   1. stage libraries, the month's BDB directory and the XML directory link
#      into /scratch1 (each only if not already present),
#   2. write the two IndriBuildIndex parameter files,
#   3. index the carry-over documents, then the fresh documents, into
#      /scratch1/month-$d,
#   4. scp the index to naraapp03.
#
# Progress (and the indexer's stdout/stderr) goes to $PROGFILE.
# Exit status: 0 on success, 2 on bad usage, non-zero if /scratch1 is
# unavailable or the final copy fails.

case "$1" in
  '' | *[!0-9]*)
    echo "usage: $0 INDEXNO (non-negative integer)" >&2
    exit 2
    ;;
esac

# Force base 10 so a zero-padded argument such as 08 is not parsed as octal.
d=$(printf "%03d" "$((10#$1))")
HOST=$(hostname -s)
PROGFILE=/vnodehomes/toaster/webarc/prog/index-$d.$HOST
INDRI_BUILD=~/webarc/lemur-4.10/bin/IndriBuildIndex

# Append one timestamped message to the progress log.
log() {
  echo "$HOST($(date +"%m/%d/%y %H:%M:%S")): $*" >> "$PROGFILE"
}

# Write an IndriBuildIndex parameter file.
#   $1 - parameter file to create
#   $2 - corpus path
#   $3 - corpus class
# Both parameter files target the same index, /scratch1/month-$d.
write_params() {
  local outfile=$1 corpus_path=$2 corpus_class=$3
  {
    echo "<parameters>"
    echo " <index>/scratch1/month-$d</index>"
    echo " <indexType>indri</indexType>"
    echo " <corpus>"
    echo " <path>$corpus_path</path>"
    echo " <class>$corpus_class</class>"
    echo " </corpus>"
    echo "</parameters>"
  } > "$outfile"
}

# Run IndriBuildIndex on one parameter file, sending stdout AND stderr to the
# progress log (the file redirection must come before 2>&1 for that).
#   $1 - parameter file
run_indexer() {
  local rc=0
  "$INDRI_BUILD" "$1" >> "$PROGFILE" 2>&1 || rc=$?
  if (( rc != 0 )); then
    # Recorded only; the run continues as before.
    log "IndriBuildIndex exited with status $rc for $1"
  fi
}

echo "" > "$PROGFILE"

if ! cd /scratch1; then
  log "cannot cd to /scratch1"
  exit 1
fi

## STAGE IN LIBRARIES ##
if [ ! -d /scratch1/lib ]; then
  cp -r /fs/webarc3/data/wikipedia/lib /scratch1/
fi

## STAGE IN BDB FILE ##
log "Staging in BDB File ($d)"
if [ ! -d "/scratch1/month-$d-co" ]; then
  cp -r "/fs/webarc3/data/wikipedia/bdb-monthly/month-$d-co" /scratch1/
  chmod 755 "/scratch1/month-$d-co"
fi
#ln -s /fs/webarc3/data/wikipedia/bdb-monthly/month-$d-co /scratch1/

## STAGE IN XML FILE ##
log "Staging in XML File ($d)"
if [ ! -d /scratch1/preprocessed-monthly ]; then
  # The XML directory is linked rather than copied; the marker file below
  # tells the other runindex.sh instances on this node that it is ready.
  ln -s /fs/webarc3/data/wikipedia/preprocessed-monthly /scratch1/
  #cp -vr /fs/webarc3/data/wikipedia/preprocessed-monthly /scratch1/
  #chmod 644 /scratch1/preprocessed-monthly/trec-month-*.xml
  done_msg="$HOST($(date +"%m/%d/%y %H:%M:%S")): XML Copy Finished"
  echo "$done_msg" >> "$PROGFILE"
  echo "$done_msg" > /scratch1/xml_copy_done
fi

## WAIT UNTIL XML COPY IS DONE ##
while [ ! -f /scratch1/xml_copy_done ]; do
  sleep 60
done

## MAKE LOCAL PARAMETER FILES ##
log "Making parameter files ($d)"
write_params "/scratch1/month-$d.params" \
  "/scratch1/preprocessed-monthly/trec-month-$d.xml" "trectext"
write_params "/scratch1/month-$d-co.params" \
  "/scratch1/month-$d-co" "trectext_from_bdb"

## START INDEXING ##
log "Indexing Carry-Overs ($d)"
run_indexer "/scratch1/month-$d-co.params"

log "Indexing Fresh Docs ($d)"
run_indexer "/scratch1/month-$d.params"

## COPY INDEX ##
log "Copying Index ($d)"
# scp due to privilege issue
if ! scp -i ~/webarc/id_rsa -r "/scratch1/month-$d" \
  scsong@naraapp03:/fs/webarc3/data/wikipedia/lemur_index/monthly/; then
  # Do not report success when the index never reached shared storage.
  log "Copying Index FAILED ($d)"
  exit 1
fi

## STAGE OUT LOCAL DATA ##
#log "Staging Out Local Data ($d)"
#rm -f month-$d-co.params
#rm -f month-$d.params
#\rm -rf ./month-$d
#\rm -rf ./month-$d-co

log "Index FINISHED!! ($d)"