tensorflow: Tensorflow Model with CTC loss having save and restore problem

I am using tensorflow 0.12 without GPU support. I was testing it with various models. My template structure is

#Load some data from file
# Pseudocode template from the issue: build the graph, then either restore
# saved weights or initialize fresh ones, depending on a command-line flag.
graph=tf.Graph()
with graph.as_default():
     #Build Network
     #saver=tf.train.Saver()
# NOTE(review): the Saver should be created inside the graph scope (as the
# commented line suggests) so it captures this graph's variables.
with tf.Session(graph=graph) as session:
     if(sys.argv[1]=="load"):
          # Resume: restore previously saved weights from checkpoint "weight_last".
          saver.restore(session,"weight_last")
     else:
            # Fresh run: initialize all variables instead of restoring.
            initop=tf.global_variables_initializer()
            session.run(initop)
    #Continue Training

Now, I am facing a strange issue. When I create an MLP or RNN with this structure using a categorical cross-entropy loss, saving and restoring works perfectly, i.e. after restoring, the loss shows the exact value that was shown during the last save. But unfortunately, when the network uses a CTC loss, then after restoring, the model starts almost as if training anew. I am not sure what is going wrong. Any help shall be highly appreciated.

About this issue

  • Original URL
  • State: closed
  • Created 7 years ago
  • Comments: 72 (19 by maintainers)

Most upvoted comments

I found my problem, and the problem was shuffling before making the dictionary… 😢

I also can confirm the issue: Whenever we are using the

    # Minimal reproduction: a single BasicLSTMCell unrolled with dynamic_rnn.
    rnncell=tf.nn.rnn_cell.BasicLSTMCell(3,state_is_tuple=True)
    rnnop,_=tf.nn.dynamic_rnn(rnncell,x,dtype=tf.float32)#None,3,8

The tf.train.Saver (or alternatives) does not save the weights of the LSTM cells properly. This also seems to be the case when using

    #Base cell
    base_lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size) 

    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(base_lstm, output_keep_prob = keep_prob)

    # Stack up multiple LSTM layers, for deep learning
    # NOTE(review): [drop]*2 places the SAME wrapped cell object in both
    # layers; in later TF versions this ties the layers' weights together —
    # confirm that is intended.
    lstms_cells = tf.contrib.rnn.MultiRNNCell([drop]*2)

    # Getting an initial state of all zeros
    initial_state = lstms_cells.zero_state(batch_size, tf.float32)
    initial_state = tf.identity(initial_state, name="initial_state")

        
    rnn_out, final_state = tf.nn.dynamic_rnn(lstms_cells, inputs, dtype=tf.float32)

So right now it seems we have no reliable way to save and load LSTM based on tf.contrib.rnn.BasicLSTMCell

I train and store the model in Python 2, and when I restore it using Python 3, I get terrible results. But when I restore the model using Python 2, the results are good. When I train and store in Python 3, I also get awful results.

I have the same problem concerning the saving of RNN cells. I use a simple tf.nn.rnn_cell.BasicRNNCell for testing. Whenever I train my network, stop the program, and restart it to, for example, generate a sequence or continue training (starting from the latest training state), it seems like the model was never trained before.

BUT when I train the model, save it and in the same run train or generate a text by restoring the model before executing the task it works! That is crazy!

I found the root cause: my input data vectors were modeled differently across executions due to an issue in the data modeling part. Otherwise, no issue was found in the TensorFlow graph after restoration. After fixing the input data model, I was able to retrain from the last checkpoint properly. I suggest cross-checking the input data model for consistency across multiple executions.

@michaelisard I am extremely sorry for my delayed response. My input is taken from a H5 file which contains features extracted from online handwriting data sample. at every time step I have 16 features. Every time I read from this file the order of data is shuffled. Here is the part where I am creating the graph.

# Graph construction for a bidirectional multi-layer LSTM with CTC loss.
# Names such as ms, nb_features, nb_hidden, nb_layers, nb_classes and lr are
# defined elsewhere in the script (not visible here).
graph=tf.Graph()
with graph.as_default():
    print("Graph Creation")
    # x: padded input batch of shape [batch, ms (max timesteps), nb_features].
    x=tf.placeholder(tf.float32,[None,ms,nb_features],name="x")

    # y: sparse target labels for CTC; seq_len: true length of each sequence.
    y=tf.sparse_placeholder(tf.int32,name="y")
    seq_len=tf.placeholder(tf.int32,[None],name="seq_len")
    

    # NOTE(review): [f_cell]*nb_layers repeats the SAME cell object for every
    # layer (likewise for the backward stack); in TF >= 1.0 this makes the
    # layers share one set of weights — confirm that is intended.
    f_cell=tf.nn.rnn_cell.LSTMCell(nb_hidden,state_is_tuple=True)
    f_stack=tf.nn.rnn_cell.MultiRNNCell([f_cell]*nb_layers)
    
    b_cell=tf.nn.rnn_cell.LSTMCell(nb_hidden,state_is_tuple=True)
    b_stack=tf.nn.rnn_cell.MultiRNNCell([b_cell]*nb_layers)
    
    outputs,_=tf.nn.bidirectional_dynamic_rnn(f_stack,b_stack,x,sequence_length=seq_len,dtype=tf.float32)

    # Concatenate forward and backward outputs along the feature axis.
    # tf.concat(axis, values) is the pre-1.0 argument order (TF 0.12 here).
    merge=tf.concat(2, outputs,name="merge")

    shape = tf.shape(x)
    batch_s,maxtimesteps=shape[0],shape[1]

    # Flatten to [batch*timesteps, 2*nb_hidden] for the output projection.
    output_reshape = tf.reshape(merge, [-1, nb_hidden*2])#batch*timesteps,nb_hidden
   
    W = tf.Variable(tf.truncated_normal([nb_hidden*2,nb_classes],stddev=0.1),name="W1")

    b = tf.Variable(tf.constant(0., shape=[nb_classes]),name="b1")

    logits = tf.add(tf.matmul(output_reshape, W) , b,name="logits") #818622,52
      
    # CTC expects time-major logits: [max_time, batch, nb_classes].
    logits_reshape = tf.transpose(tf.reshape(logits, [batch_s, -1, nb_classes]),[1,0,2],name="logits_reshape")#534,1533,52

    loss =tf.nn.ctc_loss(logits_reshape, y, seq_len,time_major=True)
    cost = tf.reduce_mean(loss,name="cost")

    optimizer = tf.train.RMSPropOptimizer(lr).minimize(cost)

    # Greedy CTC decoding for evaluation; "ler" sums the (unnormalized) edit
    # distances — the training loop converts it to a rate later.
    decoded, log_prob = tf.nn.ctc_greedy_decoder(logits_reshape, seq_len)

    actual_ed=tf.edit_distance(tf.cast(decoded[0], tf.int32),y,normalize=False)
    ler = tf.reduce_sum(actual_ed,name="ler")
    # Two Savers: one for the latest-improvement checkpoint, one for the
    # best-on-test checkpoint (saved to different directories below).
    saver=tf.train.Saver()
    bestsaver=tf.train.Saver()
    print("Network Ready")

Just after this, I am running the training and saving it

# Training/evaluation session: restore or initialize, then run nb_epochs of
# training, checkpointing on training-loss improvement and on test-accuracy
# improvement with the two separate Savers created in the graph.
with tf.Session(graph=graph,config=tf.ConfigProto(log_device_placement=False)) as session:    
    #saver = tf.train.Saver()
    if(sys.argv[1]=="load"):
        # Resume from the last "Network Improvement" checkpoint.
        saver.restore(session, "Weights/model_last")
        print("Previous weights loaded")
    else:
        init_op = tf.global_variables_initializer()
        session.run(init_op)
        print("New Weights Initialized")
    
    # best: best test accuracy so far; bestloss: lowest average training loss.
    best=0
    acctest=0
    bestloss=10000
    #testfeed = {x:test_inpx[0],y:test_inp_sparse_y[0],seq_len:test_inpseqlen[0]}
    # Fixed sample indices within the first test batch, used to print example
    # decodings each epoch.
    testcases=[0,5,17,39,60]
    true=[]
    for tr in range(len(testcases)):
        true1=label_from_sparse(test_inp_sparse_y[0],testcases[tr])
        true.append(true1)    
    print("Actual ",true)
    for e in range(nb_epochs):
        # Log file reopened in append mode every epoch and closed at the end.
        f=open(logfilename,"a")
        totalloss=0
        totalacc=0
        starttime=time.time()

        for b in range(trainbatch):
            p=b+1
            print("Reading Batch ",p,"/",trainbatch,end="\r")
            feed = {x:inpx[b],y:inp_sparse_y[b],seq_len:inpseqlen[b]}
            batchloss,batchacc, _ = session.run([cost,ler,optimizer], feed)
            
            totalloss=totalloss+batchloss
            totalacc=totalacc+batchacc
        avgloss=totalloss/trainbatch
        # ler sums edit distances; nctr presumably is the total training label
        # count, turning the sum into a label-error-rate — TODO confirm.
        avgacc=1-(totalacc/nctr)
        if(avgloss<bestloss):
            bestloss=avgloss
            print("Network Improvement")
            # Checkpoint whenever average training loss improves.
            saver.save(session, "Weights/model_last")
        
        testloss=0
        testacc=0
        for t in range(testbatch):
            testfeed = {x:test_inpx[t],y:test_inp_sparse_y[t],seq_len:test_inpseqlen[t]}
            outcome,testbatchloss,testbatchacc=session.run([decoded[0],cost,ler],testfeed)
            if(t==0):
                # Keep the first batch's decodings for the example printout.
                first_batch_outcome=outcome
            testloss=testloss+testbatchloss
            testacc=testacc+testbatchacc
        
        # Opened with mode "w", so Results.txt is overwritten every epoch.
        testfile=open("Results.txt","w")
        testfile.write("Epoch "+str(e)+"\n")
        for tc in range(len(testcases)):
            predicted=label_from_sparse(first_batch_outcome,testcases[tc])
            testfile.write(str(true[tc])+" As "+str(predicted)+"\n")
        testfile.close()
        
        testloss=testloss/testbatch
        # ncts presumably is the total test label count — TODO confirm.
        testacc=1-(testacc/ncts)
        
        endtime=time.time()        
        if(testacc>best):
            best=testacc
            print("Test Result Improvement")
            # Separate checkpoint for the best test accuracy so far.
            bestsaver.save(session, "BestWeights/model_best")
        timetaken=endtime-starttime
        msg="Epoch "+str(e)+"("+str(timetaken)+ " sec ) Training: Cost is "+str(avgloss)+" Accuracy "+str(avgacc)+" Testing: Loss "+str(testloss)+" Accuracy "+str(testacc)+"\n"
        print(msg)
        f.write(msg)
        f.close()

Now, whenever I load the model from the last save or the best save, it does not show any sign of previous training — it seems to start from scratch. I also tried import_meta_graph() without any success. But the same strategy works absolutely fine with an RNN model tested against the well-known IRIS data set (hence a classification problem).

# Working comparison case: a small LSTM + MLP classifier on the IRIS data
# set, where save/restore behaves as expected per the issue report.
mygraph=tf.Graph()

with mygraph.as_default():
    # x: [batch, 4 features, 1]; y: one-hot labels over 3 classes.
    x=tf.placeholder(tf.float32,[None,4,1])
    y=tf.placeholder(tf.float32,[None,3])
    
    rnncell=tf.nn.rnn_cell.BasicLSTMCell(3,state_is_tuple=True)
    rnnop,_=tf.nn.dynamic_rnn(rnncell,x,dtype=tf.float32)#None,3,8
    
    # Flatten the RNN output to [batch, timesteps*units] (12 = 4*3 here,
    # matching w1's input dimension below).
    shape=tf.shape(rnnop)
    batch=shape[0]
    op=shape[1]*shape[2]
    rnnrs=tf.reshape(rnnop,[batch,op],name="rnnrs")
    
    # Three-layer MLP head: 12 -> 5 -> 6 -> 3, tanh between layers.
    w1=tf.Variable(tf.truncated_normal([12,5]),name="w1")
    b1=tf.Variable(tf.truncated_normal([5]),name="b1")
    
    layer1=tf.add(tf.matmul(rnnrs,w1),b1)
    layer1_op=tf.nn.tanh(layer1)
    
    w2=tf.Variable(tf.truncated_normal([5,6]),name="w2")
    b2=tf.Variable(tf.truncated_normal([6]),name="b2")
    
    layer2=tf.add(tf.matmul(layer1_op,w2),b2)
    layer2_op=tf.nn.tanh(layer2)
    
    w3=tf.Variable(tf.truncated_normal([6,3]),name="w3")
    b3=tf.Variable(tf.truncated_normal([3]),name="b3")
    
    # Final layer stays linear: raw logits for the softmax loss below.
    layer3=tf.add(tf.matmul(layer2_op,w3),b3)
    prediction=layer3
    
    correct=tf.equal(tf.arg_max(prediction,1),tf.arg_max(y,1))
    acc=tf.reduce_mean(tf.cast(correct,tf.float32))
    #loss=tf.reduce_mean(tf.reduce_sum(tf.square(tf.sub(y,prediction))))
    # Pre-1.0 positional argument order: (logits, labels).
    loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(prediction,y))
    optimizer=tf.train.MomentumOptimizer(0.01,0.9).minimize(loss)
    saver=tf.train.Saver()
    print("Network Ready")

# Train/restore loop for the IRIS model; full-batch training, checkpoint
# overwritten after every epoch.
with tf.Session(graph=mygraph) as session:
    # loadiris is defined elsewhere; returns feature and one-hot label arrays.
    feedx,feedy=loadiris("/media/parthosarothi/OHWR/Dataset/iris.csv")
    print("Data 0 x=",feedx[0]," y=",feedy[0])
    feed={x:feedx,y:feedy}
    if(sys.argv[1]=="load"):
        saver.restore(session,"Weights/last")
        print("Previous Weights Loaded")
    else:
        initop=tf.global_variables_initializer()
        session.run(initop)
        print("New Weights Loaded")
    for e in range(100):
        l,_,p,a=session.run([loss,optimizer,prediction,acc],feed)
        print("Loss is ",l," P ",p[0]," y ",feedy[0]," A ",a)
        saver.save(session,"Weights/last")

I am completely in the dark. Any help is highly appreciated.