Personal tools

Webarc:PBS Job Scripts for Indexing

From Adapt

Revision as of 01:16, 10 November 2009 by Scsong (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to: navigation, search

submit.sh

#!/bin/bash
#PBS -N waexp
#PBS -l walltime=48:00:00
#PBS -l nodes=5
#PBS -m be
#PBS -M scsong@gmail.com
#PBS -S /bin/bash
#PBS -V
#
# submit.sh: PBS job script that starts runindex.sh for a range of index
# numbers across the nodes listed in ${PBS_NODEFILE}, polls each node until
# no runindex.sh process is left, then clears /scratch1 on every node.
#
# The directives above are read by the batch system, not by bash. Their
# meanings below follow the qsub manual; confirm against the local PBS version:
#   -N waexp               job name
#   -l walltime=48:00:00   wall-clock limit for the job
#   -l nodes=5             number of nodes requested
#   -m be                  send mail when the job begins and when it ends
#   -M scsong@gmail.com    recipient of that mail
#   -S /bin/bash           shell used to run this script
#   -V                     export the submitting environment to the job
# Keep these notes below the directives so that nothing but the shebang
# precedes them.

# Index number handled by the first node of the node file; later nodes start
# from the following numbers.
FIRSTINDEXNO=58
# Progress log of this job script.
PROGFILE=/vnodehomes/toaster/webarc/submit.prog
# Short host name of the machine running this script; prefixes every log line.
HOST=$(hostname -s)

#
# Define function stageout
#
# Signal-time cleanup: for every node listed in ${PBS_NODEFILE}, remove
# everything under /scratch1 over ssh. Note that, despite the wording of the
# first log message, nothing is copied anywhere; the data is only deleted.
#
# Globals:   HOST (read), PBS_NODEFILE (read)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 (individual ssh failures are not reported)
#
function stageout {
	local stamp node
	local -a pids=()
	stamp=$(date +"%m/%d/%y %H:%M:%S")
	echo "$HOST($stamp): SIGTERM caught. Stage out. Copying any existing outputs"
	# The node list is read on fd 3 so that ssh can never consume it via stdin.
	# The "|| [ -n ... ]" part keeps a last line that has no trailing newline.
	while read -r node <&3 || [ -n "$node" ]; do
		[ -n "$node" ] || continue
		echo "$HOST($stamp): Staging out $node"
		ssh "$node" "\rm -rf /scratch1/*" &
		pids+=("$!")
	done 3< "${PBS_NODEFILE}"
	# Wait for the removals started here, and only for those: a bare "wait"
	# would also block on any other background job of the calling shell.
	if [ "${#pids[@]}" -gt 0 ]; then
		wait "${pids[@]}"
	fi
	return 0
}

#
# Trap SIGINT and SIGTERM to invoke function stageout.
# SIGKILL (9) used to be listed here as well, but it cannot be caught by any
# process, so listing it never had an effect and it has been dropped.
#
trap 'stageout' INT TERM


# Number of lines in the node file, used as the stride between the index
# numbers given to one node.
totalnodes=$(wc -l < "${PBS_NODEFILE}")
# Start a fresh progress log (it begins with one empty line).
echo "" > "$PROGFILE"
# Keep a copy of the node list in the home directory.
cat "${PBS_NODEFILE}" > ~/pbs_nodefile

#
# Run runindex.sh at worker nodes
#
# The k-th node of the node file (k = 0, 1, ...) gets the index numbers
# FIRSTINDEXNO+k, FIRSTINDEXNO+k+totalnodes, ... up to LASTINDEXNO. One
# runindex.sh is started in the background per index number: directly for the
# first node (treated as the master, i.e. the host running this script) and
# over ssh for all others.
#
# LASTINDEXNO defaults to 83 and may be overridden from the environment.
LASTINDEXNO=${LASTINDEXNO:-83}
count=$FIRSTINDEXNO
# The node list is read on fd 3 so the launched commands cannot consume it.
while read -r node <&3 || [ -n "$node" ]; do
	[ -n "$node" ] || continue
	for (( i = count; i <= LASTINDEXNO; i += totalnodes )); do
		# Take the timestamp here; it was previously unset at this point,
		# which left the date in these log lines empty.
		DATE=$(date +"%m/%d/%y %H:%M:%S")
		echo "$HOST($DATE): $node is being deployed" >> "$PROGFILE"
		if [ "$count" -eq "$FIRSTINDEXNO" ]; then # this is master node
			~/webarc/runindex.sh "$i" &
		else
			ssh "$node" "~/webarc/runindex.sh $i" &
		fi
	done
	count=$((count + 1))
done 3< "${PBS_NODEFILE}"

#
# Monitor deploy status
#
# Nodes are checked one after another. For each node, ask over ssh whether a
# process named runindex.sh exists; as long as the check succeeds, log that
# and look again five minutes later. Once the check fails for a node, log it
# and move on to the next node.
#
PNAME="runindex.sh"
for node in $(cat "${PBS_NODEFILE}"); do
	while true; do
		if ! ssh "$node" /sbin/pidof -x "$PNAME" > /dev/null; then
			break
		fi
		DATE=$(date +"%m/%d/%y %H:%M:%S")
		echo "$HOST($DATE): runindex.sh still running at $node" >> "$PROGFILE"
		sleep 300 # five minutes between checks
	done
	DATE=$(date +"%m/%d/%y %H:%M:%S")
	echo "$HOST($DATE): no existing runindex.sh" >> "$PROGFILE"
done

#
# Stage out all data
#
# Remove everything under /scratch1 on every node of the node file. The
# removals run in parallel, one ssh per node.
#
stageout_pids=()
# The node list is read on fd 3 so that ssh cannot consume it via stdin.
while read -r node <&3 || [ -n "$node" ]; do
	[ -n "$node" ] || continue
	DATE=$(date +"%m/%d/%y %H:%M:%S")
	echo "$HOST($DATE): stage out data from $node" >> "$PROGFILE"
	ssh "$node" "\rm -rf /scratch1/*" &
	stageout_pids+=("$!")
done 3< "${PBS_NODEFILE}"
# This is the end of the job script: wait for the removals so the script does
# not exit while they are still in flight.
if [ "${#stageout_pids[@]}" -gt 0 ]; then
	wait "${stageout_pids[@]}"
fi

runindex.sh

#!/usr/bin/bash
#
# runindex.sh <index-number>
#
# Builds the index for one month: stages the inputs into /scratch1, writes
# the two parameter files, runs IndriBuildIndex twice and copies the result
# away. Progress is appended to a per-index, per-host log file.
#
# Arguments: $1 - index number, decimal digits only (leading zeros allowed)
# Exit:      2 on a missing or non-numeric argument, 1 if /scratch1 cannot
#            be entered

case "$1" in
	'' | *[!0-9]*)
		echo "usage: $0 <index-number>" >&2
		exit 2
		;;
esac

# Zero-pad to three digits. The 10# prefix forces base 10, so an already
# padded argument such as 058 is not rejected as an invalid octal number.
d=$(printf "%03d" "$((10#$1))")
HOST=$(hostname -s)
PROGFILE=/vnodehomes/toaster/webarc/prog/index-$d.$HOST
# Start a fresh progress log (it begins with one empty line).
echo "" > "$PROGFILE"

# All staging happens under /scratch1; stop here if it is not available.
if ! cd /scratch1; then
	echo "$HOST: cannot cd to /scratch1" >> "$PROGFILE"
	exit 1
fi

## STAGE IN LIBRARIES ##
# Copy the library directory to local scratch unless it is already there.
[ -d /scratch1/lib ] || cp -r /fs/webarc3/data/wikipedia/lib /scratch1/

## STAGE IN BDB FILE ##
# Copy this month's carry-over BDB directory to local scratch unless it is
# already there, and set its mode to 755.
bdb_name=month-$d-co
DATE=$(date +"%m/%d/%y %H:%M:%S")
echo "$HOST($DATE): Staging in BDB File ($d)" >> "$PROGFILE"
[ -d "/scratch1/$bdb_name" ] || {
	cp -r "/fs/webarc3/data/wikipedia/bdb-monthly/$bdb_name" /scratch1/
	chmod 755 "/scratch1/$bdb_name"
}
#ln -s /fs/webarc3/data/wikipedia/bdb-monthly/month-$d-co /scratch1/

## STAGE IN XML FILE ##
# The XML directory is not copied; a symlink to the shared location is placed
# in /scratch1 instead (the copy is kept below, commented out). Whoever
# creates the link also writes the marker file /scratch1/xml_copy_done.
DATE=$(date +"%m/%d/%y %H:%M:%S")
echo "$HOST($DATE): Staging in XML File ($d)" >> "$PROGFILE"
[ -d /scratch1/preprocessed-monthly ] || {
	ln -s /fs/webarc3/data/wikipedia/preprocessed-monthly /scratch1/
	#cp -vr /fs/webarc3/data/wikipedia/preprocessed-monthly /scratch1/
	#chmod 644 /scratch1/preprocessed-monthly/trec-month-*.xml
	DATE=$(date +"%m/%d/%y %H:%M:%S")
	xml_done_msg="$HOST($DATE): XML Copy Finished"
	echo "$xml_done_msg" >> "$PROGFILE"
	echo "$xml_done_msg" > /scratch1/xml_copy_done
}

## WAIT UNTIL XML COPY IS DONE ##
# Check once a minute until the marker file exists. If the directory was
# already present above, the marker has to come from another process.
until [ -f /scratch1/xml_copy_done ]; do
	sleep 60
done

## MAKE LOCAL PARAMETER FILES ##
# Two parameter files are written. Both name the same index directory,
# /scratch1/month-$d; they differ only in the corpus path and corpus class.
DATE=$(date +"%m/%d/%y %H:%M:%S")
echo "$HOST($DATE): Making parameter files ($d)" >> "$PROGFILE"

# write_params FILE CORPUS_PATH CORPUS_CLASS
# Overwrites FILE with a parameter document for the index /scratch1/month-$d.
write_params() {
	printf '%s\n' \
		"<parameters>" \
		"  <index>/scratch1/month-$d</index>" \
		"  <indexType>indri</indexType>" \
		"  <corpus>" \
		"    <path>$2</path>" \
		"    <class>$3</class>" \
		"  </corpus>" \
		"</parameters>" > "$1"
}

# Fresh documents: TREC text file of this month.
outfile=/scratch1/month-$d.params
write_params "$outfile" "/scratch1/preprocessed-monthly/trec-month-$d.xml" "trectext"

# Carry-overs: the staged BDB directory of this month.
outfile=/scratch1/month-$d-co.params
write_params "$outfile" "/scratch1/month-$d-co" "trectext_from_bdb"

## START INDEXING ##
# IndriBuildIndex runs twice, first with the carry-over parameter file and
# then with the fresh-document one; both parameter files name the same index
# directory.
#
# Redirections are applied left to right, so stdout must be pointed at the
# log first and stderr duplicated onto it afterwards. The previous order
# ("2>&1 >> file") left stderr on the original stdout, outside the log.
DATE=$(date +"%m/%d/%y %H:%M:%S")
echo "$HOST($DATE): Indexing Carry-Overs ($d)" >> "$PROGFILE"
~/webarc/lemur-4.10/bin/IndriBuildIndex "/scratch1/month-$d-co.params" >> "$PROGFILE" 2>&1
rc=$?
if [ "$rc" -ne 0 ]; then
	# Record the failure; the run itself continues as before.
	echo "$HOST($DATE): IndriBuildIndex exited with status $rc for Carry-Overs ($d)" >> "$PROGFILE"
fi
DATE=$(date +"%m/%d/%y %H:%M:%S")
echo "$HOST($DATE): Indexing Fresh Docs ($d)" >> "$PROGFILE"
~/webarc/lemur-4.10/bin/IndriBuildIndex "/scratch1/month-$d.params" >> "$PROGFILE" 2>&1
rc=$?
if [ "$rc" -ne 0 ]; then
	echo "$HOST($DATE): IndriBuildIndex exited with status $rc for Fresh Docs ($d)" >> "$PROGFILE"
fi


## COPY INDEX ##
# Copy the finished index directory to its destination with scp (scp due to
# privilege issue). If the copy fails, say so in the log and exit non-zero
# instead of going on to report the index as finished.
DATE=$(date +"%m/%d/%y %H:%M:%S")
echo "$HOST($DATE): Copying Index ($d)" >> "$PROGFILE"
if ! scp -i ~/webarc/id_rsa -r "/scratch1/month-$d" scsong@naraapp03:/fs/webarc3/data/wikipedia/lemur_index/monthly/; then
	DATE=$(date +"%m/%d/%y %H:%M:%S")
	echo "$HOST($DATE): Copying Index FAILED ($d)" >> "$PROGFILE"
	exit 1
fi


## STAGE OUT LOCAL DATA ##
# Disabled: nothing is removed from local scratch by this script.
#DATE=`date +"%m/%d/%y %H:%M:%S"`
#echo "$HOST($DATE): Staging Out Local Data ($d)" >> $PROGFILE
#rm -f month-$d-co.params
#rm -f month-$d.params
#\rm -rf ./month-$d
#\rm -rf ./month-$d-co

DATE=$(date +"%m/%d/%y %H:%M:%S")
echo "$HOST($DATE): Index FINISHED!! ($d)" >> "$PROGFILE"