BERTopic: Invalid request using OpenAI backend.

BERTopic always returns an invalid request error when I use the OpenAI embedding backend, and I cannot figure out why. Any idea? @MaartenGr

InvalidRequestError(message="'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", param=None, code=None, http_status=400, request_id=None)

My code is:

import gensim.corpora as corpora
import pandas as pd
import openai
import wandb
import os

from gensim.parsing.preprocessing import strip_punctuation
# from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.coherencemodel import CoherenceModel
# from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.backend import OpenAIBackend

# from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

path_output = os.path.join(os.getcwd(), 'Result', 'RQ1', 'Special Topics')
path_model = os.path.join(os.getcwd(), 'Code', 'RQ1', 'Special Topic Modeling', 'Model')
if not os.path.exists(path_model):
    os.makedirs(path_model)

wandb_project = 'asset-management-topic-modeling'
openai.api_key = os.getenv('OPENAI_API_KEY')

os.environ["WANDB_API_KEY"] = 'xxxxxx'
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["WANDB__SERVICE_WAIT"] = "100"

# set default sweep configuration
config_defaults = {
    # Refer to https://platform.openai.com/docs/models/embeddings
    'model_name': 'text-embedding-ada-002',
    'metric_distance': 'cosine',
    'calculate_probabilities': True,
    'reduce_frequent_words': True,
    'prediction_data': True,
    'low_memory': False,
    'random_state': 42,
    'ngram_range': 2,
}

config_sweep = {
    'method': 'grid',
    'metric': {
        'name': 'Coherence CV',
        'goal': 'maximize'
    },
    'parameters': {
        'n_components': {
            'values': list(range(3,11)),
        },
    }
}


class TopicModeling:
    def __init__(self, topic_type, min_cluster_size=20):
        # Initialize an empty list to store top models
        self.top_models = []
        self.path_model = path_model
        
        df = pd.read_json(os.path.join(path_output, 'labels.json'))
        if topic_type == 'anomaly':
            df = df[df['Challenge_type'] == 'anomaly']
            self.docs = (df[df['Challenge_summary'] != 'na']['Challenge_summary'].tolist()
                         + df[df['Challenge_root_cause'] != 'na']['Challenge_root_cause'].tolist())
        elif topic_type == 'solution':
            docs = df[df['Solution'] != 'na']['Solution'].tolist()
            self.docs = [strip_punctuation(doc) for doc in docs]
        
        config_defaults['min_cluster_size'] = min_cluster_size
        config_sweep['name'] = topic_type
        config_sweep['parameters']['min_samples'] = {
            'values': list(range(1, config_defaults['min_cluster_size'] + 1))
        }
        
    def __train(self):
        # Initialize a new wandb run
        with wandb.init() as run:
            # update any values not set by sweep
            run.config.setdefaults(config_defaults)

            # Step 1 - Extract embeddings
            embedding_model = OpenAIBackend("text-embedding-ada-002")

            # Step 2 - Reduce dimensionality
            umap_model = UMAP(n_components=run.config.n_components, metric=run.config.metric_distance,
                              random_state=run.config.random_state, low_memory=run.config.low_memory)

            # Step 3 - Cluster reduced embeddings
            hdbscan_model = HDBSCAN(min_cluster_size=run.config.min_cluster_size,
                                    min_samples=run.config.min_samples, prediction_data=run.config.prediction_data)

            # Step 4 - Tokenize topics
            # vectorizer_model = TfidfVectorizer(ngram_range=(1, run.config.ngram_range))

            # Step 5 - Create topic representation
            # ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=run.config.reduce_frequent_words)

            # # Step 6 - Fine-tune topic representation
            # representation_model = KeyBERTInspired()

            # All steps together
            topic_model = BERTopic(
                embedding_model=embedding_model,
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                # vectorizer_model=vectorizer_model,
                # ctfidf_model=ctfidf_model,
                # representation_model=representation_model,
                calculate_probabilities=run.config.calculate_probabilities
            )

            topic_model = topic_model.fit(self.docs)
            # topic_model.reduce_topics(self.docs, nr_topics='auto')

            # Preprocess Documents
            documents = pd.DataFrame({"Document": self.docs,
                                      "ID": range(len(self.docs)),
                                      "Topic": topic_model.topics_})
            documents_per_topic = documents.groupby(
                ['Topic'], as_index=False).agg({'Document': ' '.join})
            cleaned_docs = topic_model._preprocess_text(
                documents_per_topic.Document.values)

            # Extract vectorizer and analyzer from BERTopic
            vectorizer = topic_model.vectorizer_model
            analyzer = vectorizer.build_analyzer()

            # Extract features for Topic Coherence evaluation
            tokens = [analyzer(doc) for doc in cleaned_docs]
            dictionary = corpora.Dictionary(tokens)
            corpus = [dictionary.doc2bow(token) for token in tokens]
            topic_words = [[words for words, _ in topic_model.get_topic(
                topic)] for topic in range(len(set(topic_model.topics_))-1)]

            coherence_cv = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_v'
            )

            coherence_umass = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='u_mass'
            )

            coherence_cuci = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_uci'
            )

            coherence_cnpmi = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                corpus=corpus,
                dictionary=dictionary,
                coherence='c_npmi'
            )

            wandb.log({'Coherence CV': coherence_cv.get_coherence()})
            wandb.log({'Coherence UMASS': coherence_umass.get_coherence()})
            wandb.log({'Coherence UCI': coherence_cuci.get_coherence()})
            wandb.log({'Coherence NPMI': coherence_cnpmi.get_coherence()})
            wandb.log({'Topic Number': topic_model.get_topic_info().shape[0] - 1})
            wandb.log({'Uncategorized Post Number': topic_model.get_topic_info().at[0, 'Count']})

            model_name = f'{config_sweep["name"]}_{run.id}'
            topic_model.save(os.path.join(self.path_model, model_name))

    def sweep(self):
        wandb.login()
        sweep_id = wandb.sweep(config_sweep, project=wandb_project)
        wandb.agent(sweep_id, function=self.__train)


Most upvoted comments

@liaoelton Yes, that makes sense! I think you would only need to update the OpenAI backend to change any incoming empty string ("") to a single space (" "). Making the change there keeps it isolated, rather than changing it throughout all embedding backends.
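
Until something like that lands in the backend, a rough sketch of the idea is below. It sanitizes documents before they reach the OpenAI embedding call; the SafeOpenAIBackend name is made up here and the embed signature may differ between BERTopic versions, so cleaning the documents before calling fit works just as well.

from bertopic.backend import OpenAIBackend

class SafeOpenAIBackend(OpenAIBackend):
    def embed(self, documents, verbose=False):
        # Replace empty or whitespace-only documents with a single space,
        # since the OpenAI embeddings endpoint rejects empty inputs.
        documents = [doc if doc.strip() else " " for doc in documents]
        return super().embed(documents, verbose)

embedding_model = SafeOpenAIBackend("text-embedding-ada-002")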

I removed Chinese punctuation and symbols from the documents, and it worked! It seems that OpenAI cannot process them properly. Thank you @MaartenGr!
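
For reference, something along these lines (a sketch based on Python's unicodedata module, not my exact code) drops all Unicode punctuation and symbol characters, including the full-width Chinese ones:

import unicodedata

def strip_punct_and_symbols(text):
    # Remove every character whose Unicode category is punctuation (P*)
    # or symbol (S*), e.g. "，", "。", "【", "】".
    return "".join(ch for ch in text if not unicodedata.category(ch).startswith(("P", "S")))

docs = [strip_punct_and_symbols(doc) for doc in docs]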

@chentitus If you google this error, you will notice that there can be several reasons for it. One that I see frequently is exceeding the token limit of the input documents. Make sure the documents are not too large for the embedding model you choose.
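
For example, a quick check with tiktoken can flag documents that are too long; text-embedding-ada-002 accepts at most 8191 tokens per input, so anything above that needs to be truncated or split:

import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
max_tokens = 8191  # input limit for text-embedding-ada-002

# Indices of documents that are too long for the embedding model
too_long = [i for i, doc in enumerate(docs) if len(encoding.encode(doc)) > max_tokens]
print(f"{len(too_long)} documents exceed the token limit")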