Personal tools

Webarc:Berkeley DB Wrapper for Carryover DB: Difference between revisions

From Adapt

Jump to: navigation, search
No edit summary
 
No edit summary
 
(One intermediate revision by the same user not shown)
Line 12: Line 12:


== Usage ==
== Usage ==
Example Usage
Example C++ Code
<pre>
<pre>
/*==========================================================================
* Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
*
* Use of the Lemur Toolkit for Language Modeling and Information Retrieval
* is subject to the terms of the software license set forth in the LICENSE
* file included with this software, and also available at
* http://www.lemurproject.org/license.html
*
*==========================================================================
*/


//
//
Line 38: Line 27:
#include <fstream>
#include <fstream>
#include <jni.h>
#include <jni.h>
#include "indri/DocumentIterator.hpp"
#include "indri/Buffer.hpp"
#include "indri/UnparsedDocument.hpp"


class BDBTaggedDocumentIterator : public DocumentIterator {
private:
  UnparsedDocument _document;
  FILE *_in;
  JavaVM* _jvm;
  JNIEnv* _jniEnv;
  jobject _bdb;
  jclass _clsRevisionDatabase;
  jclass _clsRevisionData;
  jmethodID _mid_RevisionDatabase_getNext;
  jmethodID _mid_RevisionDatabase_construct;
  jmethodID _mid_RevisionDatabase_close;
  jfieldID _fid_RevisionData_date;
  jfieldID _fid_RevisionData_fileName;
  jfieldID _fid_RevisionData_offset;
 
  void _create_vm();


namespace indri
  class RevisionData {
{
  private:
namespace parse
      JNIEnv* _rdenv;
{
  public:
      long date;
      const char* filename;
      long offset;
      RevisionData(JNIEnv *env, jobject obj, jfieldID date, jfieldID fname, jfieldID offset);
      ~RevisionData();
  };
  void _openDB(const char* dbName);
  void _closeDB();
  RevisionData* _getNextDocument();
  UnparsedDocument* _nextDocument();


class BDBTaggedDocumentIterator : public DocumentIterator {
public:
private:
  BDBTaggedDocumentIterator();
UnparsedDocument _document;
  ~BDBTaggedDocumentIterator();
// std::ifstream _mfin;
  void setTags( const char* startDoc, const char* endDoc, const char* endMetadata );
FILE *_in;
  void open( const std::string& filename );
indri::utility::Buffer _buffer;
  void close();
indri::utility::Buffer _metaBuffer;
std::string _lastMetadataTag;
char* _fileName;
// std::string _bdbName;


bool _readLine( char*& beginLine, size_t& lineLength );
  UnparsedDocument* nextDocument();


const char* _startDocTag;
};
const char* _endDocTag;
const char* _endMetadataTag;
JavaVM* _jvm;
JNIEnv* _jniEnv;
jobject _bdb;
jclass _clsRevisionDatabase;
jclass _clsRevisionData;
jmethodID _mid_RevisionDatabase_getNext;
jmethodID _mid_RevisionDatabase_construct;
jmethodID _mid_RevisionDatabase_close;
jfieldID _fid_RevisionData_date;
jfieldID _fid_RevisionData_fileName;
jfieldID _fid_RevisionData_offset;


int _startDocTagLength;
void indri::parse::BDBTaggedDocumentIterator::_create_vm() {
int _endDocTagLength;
  JavaVMInitArgs vm_args;
int _endMetadataTagLength;
  JavaVMOption options[2];
 
  /* For LINUX/UNIX */
  string opt1("-Djava.class.path=./lib/mwbdbwrap.jar:./lib/je-3.3.87.jar");


void _create_vm();
  /* For Windows */
  //string opt1("-Djava.class.path=.\\lib\\mwbdbwrap.jar;.\\lib\\je-3.3.87.jar");


class RevisionData {
  string opt2("-verbose:class");
private:
  options[0].optionString = (char*)(opt1.c_str());
JNIEnv* _rdenv;
  options[1].optionString = (char*)(opt2.c_str());
public:
  vm_args.version = JNI_VERSION_1_6; //JDK version. This indicates version 1.6
long date;
  vm_args.nOptions = 1; // Change this to 2 for verbosity
const char* filename;
  vm_args.options = options;
long offset;
  vm_args.ignoreUnrecognized = 0;
RevisionData(JNIEnv *env, jobject obj, jfieldID date, jfieldID fname, jfieldID offset);
  int ret = JNI_CreateJavaVM(&_jvm, (void**)&_jniEnv, &vm_args);
~RevisionData();
  if(ret < 0)
};
      printf("\nUnable to Launch JVM\n");
void _openDB(const char* dbName);
void _closeDB();
RevisionData* _getNextDocument();
UnparsedDocument* _nextDocument();


public:
  _clsRevisionDatabase = jniEnv->FindClass("edu/umd/umiacs/mw/bdb/RevisionDatabase");
BDBTaggedDocumentIterator();
  if (_jniEnv->ExceptionCheck()) {
~BDBTaggedDocumentIterator();
      _jniEnv->ExceptionDescribe();
  }
  _mid_RevisionDatabase_construct = _jniEnv->GetMethodID(_clsRevisionDatabase, "<init>", "(Ljava/lang/String;)V");
  _mid_RevisionDatabase_close = _jniEnv->GetMethodID(_clsRevisionDatabase, "close", "()V");
  _mid_RevisionDatabase_getNext = _jniEnv->GetMethodID(_clsRevisionDatabase, "getNext", "()Ledu/umd/umiacs/mw/bdb/RevisionData;");
 
  _clsRevisionData = _jniEnv->FindClass("edu/umd/umiacs/mw/bdb/RevisionData");
  _fid_RevisionData_date = _jniEnv->GetFieldID(_clsRevisionData, "date", "J");
  _fid_RevisionData_fileName = _jniEnv->GetFieldID(_clsRevisionData, "fileName", "Ljava/lang/String;");
  _fid_RevisionData_offset= _jniEnv->GetFieldID(_clsRevisionData, "offset", "J");
}
 
indri::parse::BDBTaggedDocumentIterator::RevisionData::RevisionData(JNIEnv *env, jobject obj, jfieldID fidDate, jfieldID fidFname, jfieldID fidOffset) {
  _rdenv = env;
  date = env->GetLongField(obj, fidDate);
  jstring jsfilename =  (jstring)env->GetObjectField(obj, fidFname);
  filename = env->GetStringUTFChars(jsfilename, 0);
  offset = env->GetLongField(obj, fidOffset);
}


void setTags( const char* startDoc, const char* endDoc, const char* endMetadata );
indri::parse::BDBTaggedDocumentIterator::RevisionData::~RevisionData() {
  //  _rdenv->ReleaseStringUTFChars(jsfilename, filename);


void open( const std::string& filename );
}
void close();


UnparsedDocument* nextDocument();
void indri::parse::BDBTaggedDocumentIterator::_openDB(const char* dbName) {
  jstring jsdbName = _jniEnv->NewStringUTF(dbName);
  _bdb = _jniEnv->NewObject(_clsRevisionDatabase, _mid_RevisionDatabase_construct, jsdbName);
}


};
void indri::parse::BDBTaggedDocumentIterator::_closeDB() {
}
  _jniEnv->CallObjectMethod(_bdb, _mid_RevisionDatabase_close);
}
}


#endif // INDRI_TRECDOCUMENTITERATOR_BDB_HPP


indri::parse::BDBTaggedDocumentIterator::RevisionData* indri::parse::BDBTaggedDocumentIterator::_getNextDocument() {
  RevisionData *revData = NULL;
  jobject joRevData = _jniEnv->CallObjectMethod(_bdb, _mid_RevisionDatabase_getNext);
  //if (_jniEnv->ExceptionCheck()) {
  //  _jniEnv->ExceptionDescribe();
  //}
  if (joRevData != NULL) {
      revData = new RevisionData(_jniEnv, joRevData, _fid_RevisionData_date, _fid_RevisionData_fileName, _fid_RevisionData_offset);
  }
  return revData;
}
</pre>
</pre>


== Output Files ==
New directories for the Carryover DBs are created in the same directory that contains the Fresh DBs. Each new directory is named by appending '-co' to the corresponding Merge DB name; e.g., for a given month, if the Merge DB is named <month-003>, the Carryover DB is named <month-003-co>.
== Notes ==
* Make sure that the jar file (je-3.3.87.jar for example) for Java Berkeley DB is reachable (via CLASSPATH for example) when using this wrapper.


== Source Codes ==
== Source Codes ==
svn co http://narasvn.umiacs.umd.edu/repository/src/webarc/colstate
svn co http://narasvn.umiacs.umd.edu/repository/src/webarc/mwbdbwrap

Latest revision as of 23:32, 9 November 2009

What It Does

Two Java wrapper classes intended to be called from C/C++ code via JNI.

How To Build

In Eclipse, export 'mwbdbwrap' as a JAR.

  1. Right-click on 'mwbdbwrap' in Package Explorer, select 'export'.
  2. Select mwbdbwrap/src (should have been already selected).
  3. Put <your directory>/mwbdbwrap.jar in Export destination.
  4. Select 'Export generated class files and resources'
  5. Select 'Add directory entries' in options
  6. Click 'Finish'

Usage

Example C++ Code


//
// BDBTaggedDocumentIterator
//
// 22 September 2009 -- scsong
//

#ifndef INDRI_TRECDOCUMENTITERATOR_BDB_HPP
#define INDRI_TRECDOCUMENTITERATOR_BDB_HPP

#include <string>
#include <fstream>
#include <jni.h>

class BDBTaggedDocumentIterator : public DocumentIterator {
private:
   UnparsedDocument _document;
   FILE *_in;
   JavaVM* _jvm;
   JNIEnv* _jniEnv;
   jobject _bdb;
   jclass _clsRevisionDatabase;
   jclass _clsRevisionData;
   jmethodID _mid_RevisionDatabase_getNext;
   jmethodID _mid_RevisionDatabase_construct;
   jmethodID _mid_RevisionDatabase_close;
   jfieldID _fid_RevisionData_date;
   jfieldID _fid_RevisionData_fileName;
   jfieldID _fid_RevisionData_offset;
   
   void _create_vm();

   class RevisionData {
   private:
      JNIEnv* _rdenv;
   public:
      long date;
      const char* filename;
      long offset;
      RevisionData(JNIEnv *env, jobject obj, jfieldID date, jfieldID fname, jfieldID offset);
      ~RevisionData();
   };
   void _openDB(const char* dbName);
   void _closeDB();
   RevisionData* _getNextDocument();
   UnparsedDocument* _nextDocument();

public:
   BDBTaggedDocumentIterator();
   ~BDBTaggedDocumentIterator();
   void setTags( const char* startDoc, const char* endDoc, const char* endMetadata );
   void open( const std::string& filename );
   void close();

   UnparsedDocument* nextDocument();

};

void indri::parse::BDBTaggedDocumentIterator::_create_vm() {
   JavaVMInitArgs vm_args;
   JavaVMOption options[2];
   
   /* For LINUX/UNIX */
   string opt1("-Djava.class.path=./lib/mwbdbwrap.jar:./lib/je-3.3.87.jar");

   /* For Windows */
   //string opt1("-Djava.class.path=.\\lib\\mwbdbwrap.jar;.\\lib\\je-3.3.87.jar");

   string opt2("-verbose:class");
   options[0].optionString = (char*)(opt1.c_str());
   options[1].optionString = (char*)(opt2.c_str());
   vm_args.version = JNI_VERSION_1_6; //JDK version. This indicates version 1.6
   vm_args.nOptions = 1; // Change this to 2 for verbosity
   vm_args.options = options;
   vm_args.ignoreUnrecognized = 0;
   int ret = JNI_CreateJavaVM(&_jvm, (void**)&_jniEnv, &vm_args);
   if(ret < 0)
      printf("\nUnable to Launch JVM\n");

   _clsRevisionDatabase = jniEnv->FindClass("edu/umd/umiacs/mw/bdb/RevisionDatabase");
   if (_jniEnv->ExceptionCheck()) {
      _jniEnv->ExceptionDescribe();
   }
   _mid_RevisionDatabase_construct = _jniEnv->GetMethodID(_clsRevisionDatabase, "<init>", "(Ljava/lang/String;)V");
   _mid_RevisionDatabase_close = _jniEnv->GetMethodID(_clsRevisionDatabase, "close", "()V");
   _mid_RevisionDatabase_getNext = _jniEnv->GetMethodID(_clsRevisionDatabase, "getNext", "()Ledu/umd/umiacs/mw/bdb/RevisionData;");

   _clsRevisionData = _jniEnv->FindClass("edu/umd/umiacs/mw/bdb/RevisionData");
   _fid_RevisionData_date = _jniEnv->GetFieldID(_clsRevisionData, "date", "J");
   _fid_RevisionData_fileName = _jniEnv->GetFieldID(_clsRevisionData, "fileName", "Ljava/lang/String;");
   _fid_RevisionData_offset= _jniEnv->GetFieldID(_clsRevisionData, "offset", "J");
}

/**
 * Copies the date, file name and offset out of a Java RevisionData object.
 *
 * @param env       JNI environment used to read the fields
 * @param obj       the Java RevisionData object
 * @param fidDate   field ID of the long field holding the date
 * @param fidFname  field ID of the String field holding the file name
 * @param fidOffset field ID of the long field holding the offset
 *
 * The file name is copied into a buffer owned by this object, so the JNI
 * string buffer and the local reference to the Java string can be released
 * right away instead of leaking. `filename` is NULL when the Java field is
 * null or the string cannot be read. The copy holds the bytes exactly as
 * returned by GetStringUTFChars().
 */
indri::parse::BDBTaggedDocumentIterator::RevisionData::RevisionData(JNIEnv *env, jobject obj, jfieldID fidDate, jfieldID fidFname, jfieldID fidOffset) {
   _rdenv = env;
   filename = NULL;

   // GetLongField returns a 64-bit jlong; the members are declared `long`.
   date = static_cast<long>(env->GetLongField(obj, fidDate));
   offset = static_cast<long>(env->GetLongField(obj, fidOffset));

   jstring jsfilename = static_cast<jstring>(env->GetObjectField(obj, fidFname));
   if (jsfilename != NULL) {
      const char* utf = env->GetStringUTFChars(jsfilename, NULL);
      if (utf != NULL) {
         const std::size_t len = std::char_traits<char>::length(utf);
         char* owned = new char[len + 1];
         std::char_traits<char>::copy(owned, utf, len + 1);   // includes the terminating NUL
         filename = owned;
         env->ReleaseStringUTFChars(jsfilename, utf);
      }
      env->DeleteLocalRef(jsfilename);
   }
}

/**
 * Frees the file-name copy made by the constructor.
 *
 * `filename` must not be used after the object is destroyed, and a
 * RevisionData must not be copied (both copies would free the same buffer).
 */
indri::parse::BDBTaggedDocumentIterator::RevisionData::~RevisionData() {
   delete[] filename;
   filename = NULL;
}

void indri::parse::BDBTaggedDocumentIterator::_openDB(const char* dbName) {
   jstring jsdbName = _jniEnv->NewStringUTF(dbName);
   _bdb = _jniEnv->NewObject(_clsRevisionDatabase, _mid_RevisionDatabase_construct, jsdbName);
}

void indri::parse::BDBTaggedDocumentIterator::_closeDB() {
   _jniEnv->CallObjectMethod(_bdb, _mid_RevisionDatabase_close);
}


/**
 * Fetches the next entry from the Java RevisionDatabase held in _bdb.
 *
 * @return a heap-allocated RevisionData that the caller must delete, or NULL
 *         when getNext() returned null or threw (the exception is printed
 *         and cleared so that later JNI calls are not made with it pending).
 */
indri::parse::BDBTaggedDocumentIterator::RevisionData* indri::parse::BDBTaggedDocumentIterator::_getNextDocument() {
   jobject joRevData = _jniEnv->CallObjectMethod(_bdb, _mid_RevisionDatabase_getNext);
   if (_jniEnv->ExceptionCheck()) {
      _jniEnv->ExceptionDescribe();
      _jniEnv->ExceptionClear();
      return NULL;
   }

   if (joRevData == NULL)
      return NULL;

   RevisionData *revData = new RevisionData(_jniEnv, joRevData, _fid_RevisionData_date, _fid_RevisionData_fileName, _fid_RevisionData_offset);

   // The fields have been copied out; drop the local reference so that
   // repeated calls do not accumulate one reference per document.
   _jniEnv->DeleteLocalRef(joRevData);
   return revData;
}


Source Codes

svn co http://narasvn.umiacs.umd.edu/repository/src/webarc/mwbdbwrap