# Constructing Q&A Methods with DistilBERT and Transformers


import collections

import time

from dataclasses import dataclass

 

import torch

from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, pipeline

 

@dataclass
class QAConfig:
    """Configuration for QA settings.

    All fields have defaults, so ``QAConfig()`` is a usable configuration.
    """

    # Upper bound handed to the context chunker when splitting long contexts.
    max_sequence_length: int = 512
    # Forwarded to the QA pipeline as ``max_answer_len``.
    max_answer_length: int = 50
    # Number of candidate answers requested from the pipeline per chunk.
    top_k: int = 3
    # Candidates scoring below this value are discarded.
    threshold: float = 0.5

 

class QASystem:
    """Q&A system with context chunking and an in-memory answer cache.

    Long contexts are split into word-aligned chunks, every chunk is sent
    through a Hugging Face ``question-answering`` pipeline, and the
    highest-scoring candidate that clears the configured threshold is returned.
    """

    def __init__(self, model_name="distilbert-base-uncased-distilled-squad", gadget=None):
        """Load the tokenizer, the model and the QA pipeline.

        Args:
            model_name: Name or path passed to ``from_pretrained`` and to
                ``pipeline``.
            gadget: Device to run the pipeline on (for example ``"cuda"`` or
                ``"cpu"``). When falsy, ``"cuda"`` is used if
                ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.
        """
        self.gadget = gadget or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        # Attribute name kept for compatibility; this is the loaded QA model.
        self.mannequin = DistilBertForQuestionAnswering.from_pretrained(model_name)

        # Initialize pipeline for easy queries and the answer cache.
        # ``pipeline`` expects the keywords ``model=`` and ``device=``.
        self.qa_pipeline = pipeline("question-answering", model=model_name,
                                    tokenizer=model_name, device=self.gadget)
        self.answer_cache = {}

    def preprocess_context(self, context, max_length=512):
        """Split a long context into word-aligned chunks of at most max_length.

        Length is measured in characters of the space-joined chunk, not in
        model tokens. Whitespace runs in ``context`` collapse to single spaces.

        Args:
            context: Text to split.
            max_length: Maximum number of characters per chunk.

        Returns:
            List of chunk strings. Empty when ``context`` contains no words.
            A single word longer than ``max_length`` is never split, so it
            forms a chunk of its own that exceeds the limit.
        """
        chunks = []
        current_chunk = []
        current_length = 0

        for word in context.split():
            # A joining space is only needed when the chunk already has a word.
            added = len(word) + (1 if current_chunk else 0)
            if current_chunk and current_length + added > max_length:
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_length = len(word)
            else:
                current_chunk.append(word)
                current_length += added

        # Add the final chunk if it isn't empty
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def get_answer(self, query, context, config):
        """Get the best answer for a question together with its confidence.

        Args:
            query: The question text.
            context: Text to search for the answer.
            config: Object exposing ``max_sequence_length``,
                ``max_answer_length``, ``top_k`` and ``threshold``.

        Returns:
            Dict with keys ``"answer"`` and ``"confidence"``. When no
            candidate reaches ``config.threshold`` the confidence is ``0.0``.
            The same dict object is stored in the cache and returned again
            on a cache hit.
        """
        # The settings are part of the key: a different threshold or top_k
        # can change the outcome for the same question and context.
        cache_key = (query, context, config.max_sequence_length,
                     config.max_answer_length, config.top_k, config.threshold)
        cached = self.answer_cache.get(cache_key)
        if cached is not None:
            return cached

        # Collect acceptable candidates from every chunk of the context.
        candidates = []
        for chunk in self.preprocess_context(context, config.max_sequence_length):
            result = self.qa_pipeline(question=query,
                                      context=chunk,
                                      max_answer_len=config.max_answer_length,
                                      top_k=config.top_k)
            # The pipeline hands back a bare dict instead of a list when it
            # produces a single answer; normalize so both shapes are handled.
            if isinstance(result, dict):
                result = [result]
            candidates.extend(
                candidate for candidate in result
                if candidate["score"] >= config.threshold
            )

        # Return the best answer or indicate that none was found.
        if candidates:
            best_answer = max(candidates, key=lambda item: item["score"])
            outcome = {
                "answer": best_answer["answer"],
                "confidence": best_answer["score"],
            }
        else:
            outcome = {
                "answer": "No reply discovered",
                "confidence": 0.0,
            }

        self.answer_cache[cache_key] = outcome
        return outcome

 

class ContextManager:
    """Bounded store of text contexts with a simple word-overlap search.

    Contexts are kept in insertion order; once the store is full, adding a
    new id evicts the oldest entry.
    """

    def __init__(self, max_contexts=10):
        """Create an empty store holding at most ``max_contexts`` entries."""
        self.contexts = collections.OrderedDict()
        self.max_contexts = max_contexts

    def add_context(self, context_id, context):
        """Add a context, evicting the oldest one when the store is full.

        Replacing the text of an id that is already stored does not grow the
        store, so nothing is evicted in that case and the id keeps its
        position in the eviction order.
        """
        is_new = context_id not in self.contexts
        if is_new and self.contexts and len(self.contexts) >= self.max_contexts:
            # last=False pops the oldest (first inserted) entry.
            self.contexts.popitem(last=False)
        self.contexts[context_id] = context

    def get_context(self, context_id):
        """Get a context by id, or ``None`` when the id is not stored."""
        return self.contexts.get(context_id)

    def search_relevant_context(self, query, top_k=3):
        """Search for the contexts most relevant to ``query``.

        Returns:
            Up to ``top_k`` ``(relevance_score, context_id)`` tuples sorted
            in descending order. Ties on the score are ordered by
            ``context_id``, also descending, because whole tuples are compared.
        """
        scored = [
            (self._calculate_relevance(query, context), context_id)
            for context_id, context in self.contexts.items()
        ]
        return sorted(scored, reverse=True)[:top_k]

    def _calculate_relevance(self, query, context):
        """Calculate a relevance score between a question and a context.

        The score is the fraction of distinct, lower-cased,
        whitespace-separated question words that also occur in the context.
        Punctuation is not stripped, so ``"python?"`` and ``"python"`` are
        different words. A question without any words scores ``0.0``.
        """
        question_words = set(query.lower().split())
        if not question_words:
            # Avoid dividing by zero for an empty or whitespace-only question.
            return 0.0
        context_words = set(context.lower().split())
        return len(question_words & context_words) / len(question_words)

 

 

# Demo: register two contexts, pick the one most relevant to a question and
# ask the QA system for an answer.
context_manager = ContextManager(max_contexts=10)
context_manager.add_context("python", """
    Python is a high-level, interpreted programming language created by Guido van Rossum and launched in 1991.
    Python’s design philosophy emphasizes code readability with its notable use of great whitespace.
    Python includes a dynamic kind system and automated reminiscence administration and helps a number of programming
    paradigms, together with structured, object-oriented, and purposeful programming.
""")
context_manager.add_context("machine_learning", """
    Machine studying is a area of examine that provides computer systems the flexibility to be taught with out being
    explicitly programmed. It’s a department of synthetic intelligence based mostly on the concept that techniques
    can be taught from information, determine patterns and make selections with minimal human intervention.
""")

config = QAConfig(max_sequence_length=512, max_answer_length=50, threshold=0.5)
qa_system = QASystem()
query = "Who created Python?"

# Keep only the single best-matching context for the question.
relevant_contexts = context_manager.search_relevant_context(query, top_k=1)
if relevant_contexts:
    relevance, context_id = relevant_contexts[0]
    context = context_manager.get_context(context_id)
    print(f"Query: {query}")
    print(f"Most related context: {context_id} (relevance: {relevance:.2f})")
    print(context)

    reply = qa_system.get_answer(query, context, config)
    print(f"Reply: {reply['answer']}")
    print(f"Confidence: {reply['confidence']:.2f}")
else:
    print("No related context discovered.")

# Leave a Reply
#
# Your email address will not be published. Required fields are marked *