BERTopic: Invalid request using OpenAI backend.

BERTopic always returns an invalid request error when I use the OpenAI embedding backend, and I cannot figure out why. Any idea? @MaartenGr

InvalidRequestError(message="'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", param=None, code=None, http_status=400, request_id=None)

My code is:

import gensim.corpora as corpora
import pandas as pd
import openai
import wandb
import os

from gensim.parsing.preprocessing import strip_punctuation
# from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel
# from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.backend import OpenAIBackend

# from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

path_output = os.path.join(os.getcwd(), 'Result', 'RQ1', 'Special Topics')
path_model = os.path.join(os.getcwd(), 'Code', 'RQ1', 'Special Topic Modeling', 'Model')
if not os.path.exists(path_model):
    os.makedirs(path_model)

wandb_project = 'asset-management-topic-modeling'
openai.api_key = os.getenv('OPENAI_API_KEY')

os.environ["WANDB_API_KEY"] = 'xxxxxx'
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB__SERVICE_WAIT"] = "100"

# set default sweep configuration
config_defaults = {
    # Refer to https://platform.openai.com/docs/models/embeddings
    'model_name': 'text-embedding-ada-002',
    'metric_distance': 'cosine',
    'calculate_probabilities': True,
    'reduce_frequent_words': True,
    'prediction_data': True,
    'low_memory': False,
    'random_state': 42,
    'ngram_range': 2,
}

config_sweep = {
    'method': 'grid',
    'metric': {
        'name': 'Coherence CV',
        'goal': 'maximize'
    },
    'parameters': {
        'n_components': {
            'values': list(range(3,11)),
        },
    }
}


class TopicModeling:
    def __init__(self, topic_type, min_cluster_size=20):
        # Initialize an empty list to store top models
        self.top_models = []
        self.path_model = path_model
        
        df = pd.read_json(os.path.join(path_output, 'labels.json'))
        if topic_type == 'anomaly':
            df = df[df['Challenge_type'] == 'anomaly']
            self.docs = (df[df['Challenge_summary'] != 'na']['Challenge_summary'].tolist()
                         + df[df['Challenge_root_cause'] != 'na']['Challenge_root_cause'].tolist())
        elif topic_type == 'solution':
            docs = df[df['Solution'] != 'na']['Solution'].tolist()
            self.docs = [strip_punctuation(doc) for doc in docs]
        
        config_defaults['min_cluster_size'] = min_cluster_size
        config_sweep['name'] = topic_type
        config_sweep['parameters']['min_samples'] = {
            'values': list(range(1, config_defaults['min_cluster_size'] + 1))
        }
        
    def __train(self):
        # Initialize a new wandb run
        with wandb.init() as run:
            # update any values not set by sweep
            run.config.setdefaults(config_defaults)

            # Step 1 - Extract embeddings
            embedding_model = OpenAIBackend("text-embedding-ada-002")

            # Step 2 - Reduce dimensionality
            umap_model = UMAP(n_components=run.config.n_components, metric=run.config.metric_distance,
                              random_state=run.config.random_state, low_memory=run.config.low_memory)

            # Step 3 - Cluster reduced embeddings
            hdbscan_model = HDBSCAN(min_cluster_size=run.config.min_cluster_size,
                                    min_samples=run.config.min_samples, prediction_data=run.config.prediction_data)

            # Step 4 - Tokenize topics
            # vectorizer_model = TfidfVectorizer(ngram_range=(1, run.config.ngram_range))

            # Step 5 - Create topic representation
            # ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=run.config.reduce_frequent_words)

            # # Step 6 - Fine-tune topic representation
            # representation_model = KeyBERTInspired()

            # All steps together
            topic_model = BERTopic(
                embedding_model=embedding_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                # vectorizer_model=vectorizer_model,
                # ctfidf_model=ctfidf_model,
                # representation_model=representation_model,
                calculate_probabilities=run.config.calculate_probabilities
            )

            topic_model = topic_model.fit(self.docs)
            # topic_model.reduce_topics(self.docs, nr_topics='auto')

            # Preprocess Documents
            documents = pd.DataFrame({"Document": self.docs,
                                      "ID": range(len(self.docs)),
                                      "Topic": topic_model.topics_})
            documents_per_topic = documents.groupby(
                ['Topic'], as_index=False).agg({'Document': ' '.join})
            cleaned_docs = topic_model._preprocess_text(
                documents_per_topic.Document.values)

            # Extract vectorizer and analyzer from BERTopic
            vectorizer = topic_model.vectorizer_model
            analyzer = vectorizer.build_analyzer()

            # Extract features for Topic Coherence evaluation
            tokens = [analyzer(doc) for doc in cleaned_docs]
            dictionary = corpora.Dictionary(tokens)
            corpus = [dictionary.doc2bow(token) for token in tokens]
            topic_words = [[words for words, _ in topic_model.get_topic(
                topic)] for topic in range(len(set(topic_model.topics_))-1)]

            coherence_cv = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_v'
            )

            coherence_umass = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='u_mass'
            )

            coherence_cuci = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_uci'
            )

            coherence_cnpmi = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_npmi'
            )

            wandb.log({'Coherence CV': coherence_cv.get_coherence()})
            wandb.log({'Coherence UMASS': coherence_umass.get_coherence()})
            wandb.log({'Coherence UCI': coherence_cuci.get_coherence()})
            wandb.log({'Coherence NPMI': coherence_cnpmi.get_coherence()})
            wandb.log({'Topic Number': topic_model.get_topic_info().shape[0] - 1})
            wandb.log({'Uncategorized Post Number': topic_model.get_topic_info().at[0, 'Count']})

            model_name = f'{config_sweep["name"]}_{run.id}'
            topic_model.save(os.path.join(self.path_model, model_name))

    def sweep(self):
        wandb.login()
        sweep_id = wandb.sweep(config_sweep, project=wandb_project)
        wandb.agent(sweep_id, function=self.__train)


Most upvoted comments

@liaoelton Yes, that makes sense! I think you would only need to update the OpenAI backend to change any incoming empty string ("") to a single space (" "). Making the change there keeps it isolated, rather than changing it throughout all embedding backends.
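
Until something like that lands in the backend, a rough sketch of the idea is below. It sanitizes documents before they reach the OpenAI embedding call; the SafeOpenAIBackend name is made up here and the embed signature may differ between BERTopic versions, so cleaning the documents before calling fit works just as well.

from bertopic.backend import OpenAIBackend

class SafeOpenAIBackend(OpenAIBackend):
    def embed(self, documents, verbose=False):
        # Replace empty or whitespace-only documents with a single space,
        # since the OpenAI embeddings endpoint rejects empty inputs.
        documents = [doc if doc.strip() else " " for doc in documents]
        return super().embed(documents, verbose)

embedding_model = SafeOpenAIBackend("text-embedding-ada-002")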

I removed Chinese punctuation and symbols from the documents, and it worked! It seems that OpenAI cannot process them properly. Thank you @MaartenGr!
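
For reference, something along these lines (a sketch based on Python's unicodedata module, not my exact code) drops all Unicode punctuation and symbol characters, including the full-width Chinese ones:

import unicodedata

def strip_punct_and_symbols(text):
    # Remove every character whose Unicode category is punctuation (P*)
    # or symbol (S*), e.g. "，", "。", "【", "】".
    return "".join(ch for ch in text if not unicodedata.category(ch).startswith(("P", "S")))

docs = [strip_punct_and_symbols(doc) for doc in docs]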

@chentitus If you google this error, you will notice that there can be several reasons for it. One that I see frequently is exceeding the token limit of the input documents. Make sure the documents are not too large for the embedding model you choose.
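
For example, a quick check with tiktoken can flag documents that are too long; text-embedding-ada-002 accepts at most 8191 tokens per input, so anything above that needs to be truncated or split:

import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
max_tokens = 8191  # input limit for text-embedding-ada-002

# Indices of documents that are too long for the embedding model
too_long = [i for i, doc in enumerate(docs) if len(encoding.encode(doc)) > max_tokens]
print(f"{len(too_long)} documents exceed the token limit")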