← Tasks
Voice message classification
completed · code_gen · P3
Description
Transcribe WhatsApp voice messages using Whisper/speech-to-text, then run through classification chain.
Assignee
-
Claimed By
Cloud Lu → Claude Sonnet
Created
21d ago
Completed
1d ago
Quality
Overall
47%
Pass
FAIL
Graded By
-
Graded At
-
Cost
Input Tokens
-
Output Tokens
-
Est. USD
-
Result
{
"cost": {
"input_tokens": 36,
"estimated_usd": 0.052083,
"output_tokens": 3465
},
"model": "claude-sonnet-4-20250514",
"status": "success",
"transport": "anthropic_api",
"latency_ms": 40799,
"response_text": "I'll help you create a voice message classification system that transcribes WhatsApp voice messages and then classifies them. Here's a complete solution:\n\n## 1. Basic Setup and Dependencies\n\n```python\nimport whisper\nimport torch\nfrom transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification\nimport librosa\nimport numpy as np\nfrom typing import List, Dict, Any\nimport logging\nfrom pathlib import Path\nimport json\n\n# Setup logging\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n```\n\n## 2. Voice Message Transcription Class\n\n```python\nclass VoiceTranscriber:\n def __init__(self, model_size: str = \"base\"):\n \"\"\"\n Initialize Whisper model for transcription\n \n Args:\n model_size: Whisper model size (\"tiny\", \"base\", \"small\", \"medium\", \"large\")\n \"\"\"\n self.model = whisper.load_model(model_size)\n logger.info(f\"Loaded Whisper model: {model_size}\")\n \n def transcribe_audio(self, audio_path: str) -> Dict[str, Any]:\n \"\"\"\n Transcribe audio file to text\n \n Args:\n audio_path: Path to audio file\n \n Returns:\n Dictionary containing transcription results\n \"\"\"\n try:\n result = self.model.transcribe(audio_path)\n return {\n \"text\": result[\"text\"].strip(),\n \"language\": result[\"language\"],\n \"segments\": result[\"segments\"],\n \"success\": True\n }\n except Exception as e:\n logger.error(f\"Transcription failed: {str(e)}\")\n return {\n \"text\": \"\",\n \"language\": \"unknown\",\n \"segments\": [],\n \"success\": False,\n \"error\": str(e)\n }\n \n def transcribe_with_timestamps(self, audio_path: str) -> List[Dict]:\n \"\"\"\n Get transcription with detailed timestamps\n \"\"\"\n result = self.transcribe_audio(audio_path)\n if result[\"success\"]:\n return [\n {\n \"text\": segment[\"text\"],\n \"start\": segment[\"start\"],\n \"end\": segment[\"end\"]\n }\n for segment in result[\"segments\"]\n ]\n return []\n```\n\n## 3. 
Text Classification System\n\n```python\nclass MessageClassifier:\n def __init__(self, classification_type: str = \"sentiment\"):\n \"\"\"\n Initialize text classifier\n \n Args:\n classification_type: Type of classification \n (\"sentiment\", \"intent\", \"emotion\", \"custom\")\n \"\"\"\n self.classification_type = classification_type\n self.classifier = self._load_classifier()\n \n def _load_classifier(self):\n \"\"\"Load appropriate classification model\"\"\"\n if self.classification_type == \"sentiment\":\n return pipeline(\n \"sentiment-analysis\",\n model=\"cardiffnlp/twitter-roberta-base-sentiment-latest\"\n )\n elif self.classification_type == \"emotion\":\n return pipeline(\n \"text-classification\",\n model=\"j-hartmann/emotion-english-distilroberta-base\"\n )\n elif self.classification_type == \"intent\":\n return pipeline(\n \"text-classification\",\n model=\"facebook/bart-large-mnli\"\n )\n else:\n # Default to sentiment\n return pipeline(\"sentiment-analysis\")\n \n def classify_text(self, text: str) -> Dict[str, Any]:\n \"\"\"\n Classify the transcribed text\n \n Args:\n text: Input text to classify\n \n Returns:\n Classification results\n \"\"\"\n if not text.strip():\n return {\"error\": \"Empty text provided\"}\n \n try:\n if self.classification_type == \"intent\":\n return self._classify_intent(text)\n else:\n results = self.classifier(text)\n return {\n \"classification\": results[0][\"label\"],\n \"confidence\": results[0][\"score\"],\n \"all_results\": results\n }\n except Exception as e:\n logger.error(f\"Classification failed: {str(e)}\")\n return {\"error\": str(e)}\n \n def _classify_intent(self, text: str) -> Dict[str, Any]:\n \"\"\"Custom intent classification using zero-shot\"\"\"\n candidate_labels = [\n \"question\", \"request\", \"complaint\", \"compliment\", \n \"information\", \"greeting\", \"goodbye\", \"urgent\"\n ]\n \n classifier = pipeline(\n \"zero-shot-classification\",\n model=\"facebook/bart-large-mnli\"\n )\n \n result = 
classifier(text, candidate_labels)\n return {\n \"classification\": result[\"labels\"][0],\n \"confidence\": result[\"scores\"][0],\n \"all_results\": dict(zip(result[\"labels\"], result[\"scores\"]))\n }\n```\n\n## 4. Complete Voice Message Processing Pipeline\n\n```python\nclass VoiceMessageProcessor:\n def __init__(self, \n whisper_model: str = \"base\",\n classification_type: str = \"sentiment\"):\n \"\"\"\n Complete pipeline for voice message processing\n \n Args:\n whisper_model: Whisper model size\n classification_type: Type of classification to perform\n \"\"\"\n self.transcriber = VoiceTranscriber(whisper_model)\n self.classifier = MessageClassifier(classification_type)\n \n def process_voice_message(self, audio_path: str) -> Dict[str, Any]:\n \"\"\"\n Complete processing pipeline: transcribe + classify\n \n Args:\n audio_path: Path to audio file\n \n Returns:\n Complete processing results\n \"\"\"\n # Step 1: Transcribe\n logger.info(\"Starting transcription...\")\n transcription_result = self.transcriber.transcribe_audio(audio_path)\n \n if not transcription_result[\"success\"]:\n return {\n \"success\": False,\n \"error\": \"Transcription failed\",\n \"transcription\": transcription_result\n }\n \n # Step 2: Classify\n logger.info(\"Starting classification...\")\n text = transcription_result[\"text\"]\n classification_result = self.classifier.classify_text(text)\n \n # Combine results\n return {\n \"success\": True,\n \"audio_path\": audio_path,\n \"transcription\": {\n \"text\": text,\n \"language\": transcription_result[\"language\"],\n \"segments\": transcription_result[\"segments\"]\n },\n \"classification\": classification_result,\n \"metadata\": {\n \"audio_duration\": self._get_audio_duration(audio_path),\n \"processing_timestamp\": self._get_timestamp()\n }\n }\n \n def process_batch(self, audio_paths: List[str]) -> List[Dict[str, Any]]:\n \"\"\"Process multiple voice messages\"\"\"\n results = []\n for audio_path in audio_paths:\n 
logger.info(f\"Processing: {audio_path}\")\n result = self.process_voice_message(audio_path)\n results.append(result)\n return results\n \n def _get_audio_duration(self, audio_path: str) -> float:\n \"\"\"Get audio file duration\"\"\"\n try:\n y, sr = librosa.load(audio_path)\n return len(y) / sr\n except:\n return 0.0\n \n def _get_timestamp(self) -> str:\n \"\"\"Get current timestamp\"\"\"\n from datetime import datetime\n return datetime.now().isoformat()\n```\n\n## 5. WhatsApp-Specific Handler\n\n```python\nclass WhatsAppVoiceHandler:\n def __init__(self, processor: VoiceMessageProcessor):\n self.processor = processor\n self.supported_formats = ['.opus', '.ogg', '.mp3', '.wav', '.m4a']\n \n def is_voice_message(self, file_path: str) -> bool:\n \"\"\"Check if file is a supported voice message format\"\"\"\n return Path(file_path).suffix.lower() in self.supported_formats\n \n def convert_opus_to_wav(self, opus_path: str) -> str:\n \"\"\"Convert WhatsApp OPUS files to WAV for processing\"\"\"\n try:\n from pydub import AudioSegment\n \n # Load opus file\n audio = AudioSegment.from_file(opus_path, format=\"opus\")\n \n # Convert to wav\n wav_path = opus_path.replace('.opus', '.wav')\n audio.export(wav_path, format=\"wav\")\n \n return wav_path\n except Exception as e:\n logger.error(f\"Audio conversion failed: {str(e)}\")\n return opus_path\n \n def process_whatsapp_voice(self, file_path: str) -> Dict[str, Any]:\n \"\"\"Process WhatsApp voice message\"\"\"\n if not self.is_voice_message(file_path):\n return {\"error\": \"Unsupported file format\"}\n \n # Convert OPUS to WAV if needed\n if file_path.endswith('.opus'):\n file_path = self.convert_opus_to_wav(file_path)\n \n # Process the voice message\n return self.processor.process_voice_message(file_path)\n```\n\n## 6. 
Usage Examples\n\n```python\n# Basic usage\ndef basic_example():\n # Initialize processor\n processor = VoiceMessageProcessor(\n whisper_model=\"base\",\n classification_type=\"sentiment\"\n )\n \n # Process single voice message\n result = processor.process_voice_message(\"voice_message.wav\")\n print(json.dumps(result, indent=2))\n\n# WhatsApp specific usage\ndef whatsapp_example():\n # Initialize with WhatsApp handler\n processor = VoiceMessageProcessor(\n whisper_model=\"small\",\n classification_type=\"emotion\"\n )\n whatsapp_handler = WhatsAppVoiceHandler(processor)\n \n # Process WhatsApp voice message\n result = whatsapp_handler.process_whatsapp_voice(\"voice-note.opus\")\n \n print(f\"Transcription: {result['transcription']['text']}\")\n print(f\"Classification: {result['classification']['classification']}\")\n print(f\"Confidence: {result['classification']['confidence']:.2f}\")\n\n# Batch processing example\ndef batch_processing_example():\n processor = VoiceMessageProcessor()\n \n # Process multiple files\n audio_files = [\"msg1.wav\", \"msg2.wav\", \"msg3.wav\"]\n results = processor.process_batch(audio_files)\n \n # Analyze results\n for i, result in enumerate(results):\n if result[\"success\"]:\n print(f\"Message {i+1}:\")\n print(f\" Text: {result['transcription']['text']}\")\n print(f\" Classification: {result['classification']['classification']}\")\n print()\n```\n\n## 7. 
Advanced Classification Pipeline\n\n```python\nclass AdvancedClassificationPipeline:\n def __init__(self):\n self.sentiment_classifier = pipeline(\"sentiment-analysis\")\n self.emotion_classifier = pipeline(\n \"text-classification\",\n model=\"j-hartmann/emotion-english-distilroberta-base\"\n )\n self.intent_classifier = pipeline(\n \"zero-shot-classification\",\n model=\"facebook/bart-large-mnli\"\n )\n \n def multi_classification(self, text: str) -> Dict[str, Any]:\n \"\"\"Run multiple classifications on the same text\"\"\"\n results = {}\n \n # Sentiment Analysis\n sentiment = self.sentiment_classifier(text)[0]\n results[\"sentiment\"] = {\n \"label\": sentiment[\"label\"],\n \"confidence\": sentiment[\"score\"]\n }\n \n # Emotion Detection\n emotion = self.emotion_classifier(text)[0]\n results[\"emotion\"] = {\n \"label\": emotion[\"label\"],\n \"confidence\": emotion[\"score\"]\n }\n \n # Intent Classification\n intent_labels = [\"question\", \"request\", \"complaint\", \"information\", \"casual\"]\n intent = self.intent_classifier(text, intent_labels)\n results[\"intent\"] = {\n \"label\": intent[\"labels\"][0],\n \"confidence\": intent[\"scores\"][0]\n }\n \n return results\n```\n\n## 8. Installation Requirements\n\n```bash\n# Create requirements.txt\npip install openai-whisper\npip install transformers\npip install torch\npip install librosa\npip install pydub\npip install scipy\npip install numpy\n```\n\n## 9. 
API Wrapper\n\n```python\nfrom flask import Flask, request, jsonify\nimport tempfile\nimport os\n\napp = Flask(__name__)\nprocessor = VoiceMessageProcessor()\n\n@app.route('/process_voice', methods=['POST'])\ndef process_voice_api():\n if 'audio' not in request.files:\n return jsonify({\"error\": \"No audio file provided\"}), 400\n \n file = request.files['audio']\n \n # Save temporarily\n with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:\n file.save(tmp_file.name)\n \n # Process\n result = processor.process_voice_message(tmp_file.name)\n \n # Cleanup\n os.unlink(tmp_file.name)\n \n return jsonify(result)\n\nif __name__ == '__main__':\n app.run(debug=True)\n```\n\nThis complete solution provides:\n\n1. **Voice Transcription** using Whisper\n2. **Multiple Classification Types** (sentiment, emotion, intent)\n3. **WhatsApp-specific handling** for OPUS files\n4. **Batch processing** capabilities\n5. **API wrapper** for web integration\n6. **Error handling** and logging\n7. **Flexible configuration** options\n\nThe system can handle various audio formats and provides detailed results including transcription, classification, confidence scores, and metadata."
}

Audit Trail (2)
1d ago · task_completed · Terminal Lu
1d ago · task_claimed · Terminal Lu
Task ID: 3b3b0189-b25e-4f35-ad8a-496ed11fbe47