deeplearning4j: `WordVectorSerializer` cannot read gensim word2vec model

in gensim v2.3.0-py27

   # Train a gensim word2vec model from pre-tokenized sentences, using
   # hyperparameters pulled from a config object (self._wv_config).
   # NOTE(review): 'use_skip_kgram' looks like a typo for 'use_skip_gram'
   # (sg=1 selects skip-gram training) — confirm against the config class.
   model = Word2Vec(sentences,
                    size=self._wv_config.vector_size,
                    window=self._wv_config.window_size,
                    min_count=self._wv_config.min_count,
                    workers=self._wv_config.workers,
                    sg=int(self._wv_config.use_skip_kgram),
                    iter=self._wv_config.num_epoch)
   # model.save() writes gensim's native (pickle-based) format, which is
   # NOT the word2vec binary/text format DL4J knows how to read.
   model.save(self._model_path)

in dl4j v0.9.1

  import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer
  // Attempts to auto-detect the file format; fails on gensim's native
  // save() output because that is a pickled gensim object, not word2vec format.
  WordVectorSerializer.readWord2VecModel(new java.io.File("../data/wordvec.model"))

error

Exception in thread "main" java.lang.RuntimeException: Unable to guess input file format. Please use corresponding loader directly
	at org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.readWord2VecModel(WordVectorSerializer.java:2480)
	at org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.readWord2VecModel(WordVectorSerializer.java:2266)
	at LoadWordVecModel$.delayedEndpoint$LoadWordVecModel$1(RunTensorflowModel.scala:47)
	at LoadWordVecModel$delayedInit$body.apply(RunTensorflowModel.scala:45)
	at scala.Function0.apply$mcV$sp(Function0.scala:34)
	at scala.Function0.apply$mcV$sp$(Function0.scala:34)
	at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
	at scala.App.$anonfun$main$1$adapted(App.scala:76)
	at scala.collection.immutable.List.foreach(List.scala:389)
	at scala.App.main(App.scala:76)
	at scala.App.main$(App.scala:74)
	at LoadWordVecModel$.main(RunTensorflowModel.scala:45)
	at LoadWordVecModel.main(RunTensorflowModel.scala)

About this issue

  • Original URL
  • State: closed
  • Created 7 years ago
  • Comments: 15 (6 by maintainers)

Most upvoted comments

I’ve got the same problem:

Exception in thread "main" java.lang.RuntimeException: Unable to guess input file format. Please use corresponding loader directly

I solved it like this:

gensim (3.0.1, python 2.7.5): model.wv.save_word2vec_format("abc.bin.gz", binary=True)

in dl4j 0.9.1:

/**
 * Entry point demonstrating the workaround: export the model from gensim with
 * {@code model.wv.save_word2vec_format(..., binary=True)}, then load the
 * resulting (optionally gzip-compressed) Google-format binary file here.
 */
public static void main(String[] args) throws Exception {
    File exportedModel = new File("abc.bin.gz");
    Word2Vec vec = loadGoogleBinaryModel(exportedModel, false);
}

/**
 * Loads a word2vec model stored in Google's binary format.
 *
 * @param modelFile  the model file; gzip-compressed files are handled transparently
 * @param lineBreaks whether each vector record is terminated by a line-break byte
 *                   that must be consumed while reading
 * @return the loaded {@link Word2Vec} model (vectors are unit-normalized)
 * @throws IOException if the file cannot be read or is malformed
 */
public static Word2Vec loadGoogleBinaryModel(File modelFile, boolean lineBreaks) throws IOException {
    // Delegates with normalize = true, matching what downstream similarity
    // lookups expect.
    return readBinaryModel(modelFile, lineBreaks, true);
}

/**
 * Reads a word2vec model in Google's binary format (optionally gzip-compressed,
 * detected from the filename) into a DL4J {@link Word2Vec} instance.
 *
 * <p>File layout: an ASCII header {@code "<vocabSize> <vectorSize>\n"} followed by
 * {@code vocabSize} records, each a space-terminated word and {@code vectorSize}
 * little-endian floats (plus an optional trailing line-break byte).
 *
 * @param modelFile  model file to read
 * @param linebreaks whether each record is followed by a line-break byte to consume
 * @param normalize  whether to unit-normalize each vector before storing it
 * @return a {@link Word2Vec} backed by the loaded vocabulary and lookup table
 * @throws NumberFormatException if the header is not two integers
 * @throws IOException           on any read failure
 */
private static Word2Vec readBinaryModel(File modelFile, boolean linebreaks, boolean normalize)
    throws NumberFormatException, IOException {
    InMemoryLookupTable<VocabWord> lookupTable;
    VocabCache<VocabWord> cache;
    INDArray syn0;
    int words, size;

    // Suppress periodic GC during the bulk load to avoid pauses; the caller's
    // settings are restored in the finally block.
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();

    if (originalPeriodic)
        Nd4j.getMemoryManager().togglePeriodicGc(false);

    Nd4j.getMemoryManager().setOccasionalGcFrequency(50000);

    try (BufferedInputStream bis = new BufferedInputStream(GzipUtils.isCompressedFilename(modelFile.getName())
        ? new GZIPInputStream(new FileInputStream(modelFile)) : new FileInputStream(modelFile));
         DataInputStream dis = new DataInputStream(bis)) {
        // Header: vocabulary size then vector dimensionality.
        words = Integer.parseInt(WordVectorSerializer.readString(dis));
        size = Integer.parseInt(WordVectorSerializer.readString(dis));
        syn0 = Nd4j.create(words, size);
        cache = new AbstractCache<>();

        WordVectorSerializer.printOutProjectedMemoryUse(words, size, 1);

        lookupTable = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().cache(cache)
            .useHierarchicSoftmax(false).vectorLength(size).build();

        String word;
        float[] vector = new float[size];
        for (int i = 0; i < words; i++) {

            word = WordVectorSerializer.readString(dis);
            log.trace("Loading " + word + " with word " + i);

            // Always consume the vector bytes, even for entries we skip below,
            // so the stream stays aligned on record boundaries.
            for (int j = 0; j < size; j++) {
                vector[j] = WordVectorSerializer.readFloat(dis);
            }

            // Some exported models contain empty-string tokens; skip them.
            // BUG FIX: the row must be stored at the vocabulary index, not at the
            // raw loop index i — otherwise, after the first skipped entry, every
            // subsequent word's vocab index points at the wrong syn0 row.
            if (StringUtils.isNotEmpty(word)) {
                VocabWord vw = new VocabWord(1.0, word);
                int index = cache.numWords();
                vw.setIndex(index);

                cache.addToken(vw);
                cache.addWordToIndex(index, vw.getLabel());

                cache.putVocabWord(word);

                syn0.putRow(index, normalize ? Transforms.unitVec(Nd4j.create(vector)) : Nd4j.create(vector));
            }

            if (linebreaks) {
                dis.readByte(); // consume the trailing line-break byte
            }

            Nd4j.getMemoryManager().invokeGcOccasionally();
        }
    } finally {
        // Restore the caller's GC configuration even when reading fails.
        if (originalPeriodic)
            Nd4j.getMemoryManager().togglePeriodicGc(true);

        Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
    }

    lookupTable.setSyn0(syn0);

    // Hyperparameters below are placeholders for an inference-only model;
    // no training is performed on the returned instance.
    Word2Vec ret = new Word2Vec.Builder().useHierarchicSoftmax(false).resetModel(false).layerSize(syn0.columns())
        .allowParallelTokenization(true).elementsLearningAlgorithm(new SkipGram<VocabWord>())
        .learningRate(0.025).windowSize(5).workers(1).build();

    ret.setVocab(cache);
    ret.setLookupTable(lookupTable);

    return ret;
}