whisper-plus

<div align="center"> <h2> WhisperPlus：更快、更智能、功能更强大 🚀 </h2> <div> <img width="500" alt="预览图" src="https://raw.githubusercontent.com/kadirnar/whisper-plus/main/doc/openai-whisper.jpg"> </div> <div> <a href="https://pypi.org/project/whisperplus" target="_blank"> <img src="https://yellow-cdn.veclightyear.com/835a84d5/158c71df-a1d2-443e-bf7e-7d05b52dba33.svg?color=%2334D058" alt="支持的Python版本"> </a> <a href="https://badge.fury.io/py/whisperplus"><img src="https://yellow-cdn.veclightyear.com/835a84d5/0c3b1fea-6784-4bd6-9dc6-1e8ad54c5bf3.svg" alt="pypi版本"></a> <a href="https://huggingface.co/spaces/ArtGAN/Audio-WebUI"><img src="https://yellow-cdn.veclightyear.com/835a84d5/62e24fd5-6b37-4d0e-af4e-cc7aba6ed4f1.svg" alt="HuggingFace Spaces"></a> </div> </div>

🛠️ 安装

pip install whisperplus git+https://github.com/huggingface/transformers
pip install flash-attn --no-build-isolation

🤗 模型中心

你可以在HuggingFace模型中心找到这些模型

🎙️ 使用方法

要使用whisperplus库完成不同的任务，请参考以下各节的示例：

🎵 YouTube链接转音频

from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3
from transformers import BitsAndBytesConfig, HqqConfig
import torch

url = "https://www.youtube.com/watch?v=di3rHkEZuUw"
audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")

hqq_config = HqqConfig(
    nbits=4,
    group_size=64,
    quant_zero=False,
    quant_scale=False,
    axis=0,
    offload_meta=False,
)  # 默认使用axis=0

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

pipeline = SpeechToTextPipeline(
    model_id="distil-whisper/distil-large-v3",
    quant_config=hqq_config,
    flash_attention_2=True,
)

transcript = pipeline(
    audio_path=audio_path,
    chunk_length_s=30,
    stride_length_s=5,
    max_new_tokens=128,
    batch_size=100,
    language="english",
    return_timestamps=False,
)

print(transcript)

🍎 Apple MLX

from whisperplus.pipelines import mlx_whisper
from whisperplus import download_youtube_to_mp3

url = "https://www.youtube.com/watch?v=1__CAdTJ5JU"
audio_path = download_youtube_to_mp3(url)

text = mlx_whisper.transcribe(
    audio_path, path_or_hf_repo="mlx-community/whisper-large-v3-mlx"
)["text"]
print(text)

🍏 Lightning MLX Whisper

from whisperplus.pipelines.lightning_whisper_mlx import LightningWhisperMLX
from whisperplus import download_youtube_to_mp3

url = "https://www.youtube.com/watch?v=1__CAdTJ5JU"
audio_path = download_youtube_to_mp3(url)

whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12, quant=None)
output = whisper.transcribe(audio_path=audio_path)["text"]

📰 文本摘要

from whisperplus.pipelines.summarization import TextSummarizationPipeline

summarizer = TextSummarizationPipeline(model_id="facebook/bart-large-cnn")
summary = summarizer.summarize(transcript)
print(summary[0]["summary_text"])

📰 长文本摘要

from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline

summarizer = LongTextSummarizationPipeline(model_id="facebook/bart-large-cnn")
summary_text = summarizer.summarize(transcript)
print(summary_text)

💬 说话人分离

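使用前，你必须先在HuggingFace上确认（接受）以下两个模型的许可权限：

- pyannote/speaker-diarization-3.1（https://huggingface.co/pyannote/speaker-diarization-3.1）
- pyannote/segmentation-3.0（https://huggingface.co/pyannote/segmentation-3.0）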

pip install -r requirements/speaker_diarization.txt
pip install -U "huggingface_hub[cli]"
huggingface-cli login

from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
from whisperplus import download_youtube_to_mp3, format_speech_to_dialogue

audio_path = download_youtube_to_mp3("https://www.youtube.com/watch?v=mRB14sFHw2E")

device = "cuda"  # cpu 或 mps
pipeline = ASRDiarizationPipeline.from_pretrained(
    asr_model="openai/whisper-large-v3",
    diarizer_model="pyannote/speaker-diarization-3.1",
    use_auth_token=False,
    chunk_length_s=30,
    device=device,
)

output_text = pipeline(audio_path, num_speakers=2, min_speaker=1, max_speaker=2)
dialogue = format_speech_to_dialogue(output_text)
print(dialogue)

⭐ RAG - 与视频对话(LanceDB)

pip install sentence-transformers ctransformers langchain

from whisperplus.pipelines.chatbot import ChatWithVideo

chat = ChatWithVideo(
    input_file="trascript.txt",
    llm_model_name="TheBloke/Mistral-7B-v0.1-GGUF",
    llm_model_file="mistral-7b-v0.1.Q4_K_M.gguf",
    llm_model_type="mistral",
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
)

query = "这个视频讲的是什么？"
response = chat.run_query(query)
print(response)

🌠 RAG - 与视频对话(AutoLLM)

pip install "autollm>=0.1.9"

from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo

# service_context_params
system_prompt = """
你是一个友好的AI助手，帮助用户根据你能访问的文档找到与他们问题最相关和准确的答案。
回答问题时，主要依赖文档中的信息。
"""
query_wrapper_prompt = """
以下是文档信息。
---------------------
{context_str}
---------------------
使用文档信息并主要依赖它来回答查询。
查询：{query_str}
回答：
"""

chat = AutoLLMChatWithVideo(
    input_file="input_dir",  # mp3文件路径
    openai_key="YOUR_OPENAI_KEY",  # 可选
    huggingface_key="YOUR_HUGGINGFACE_KEY",  # 可选
    llm_model="gpt-3.5-turbo",
    llm_max_tokens="256",
    llm_temperature="0.1",
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    embed_model="huggingface/BAAI/bge-large-zh",  # "text-embedding-ada-002"
)

query = "这个视频讲的是什么？"
response = chat.run_query(query)
print(response)

🎙️ 文字转语音

from whisperplus.pipelines.text2speech import TextToSpeechPipeline

tts = TextToSpeechPipeline(model_id="suno/bark")
audio = tts(text="你好，世界", voice_preset="v2/en_speaker_6")

🎥 自动字幕

pip install moviepy
apt install imagemagick libmagick++-dev
sed -i 's/none/read,write/g' /etc/ImageMagick-6/policy.xml

from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
from whisperplus import download_youtube_to_mp4

video_path = download_youtube_to_mp4(
    "https://www.youtube.com/watch?v=di3rHkEZuUw",
    output_dir="downloads",
    filename="test",
)  # 可选

caption = WhisperAutoCaptionPipeline(model_id="openai/whisper-large-v3")
caption(video_path=video_path, output_path="output.mp4", language="chinese")

😍 贡献

pip install pre-commit
pre-commit install
pre-commit run --all-files

📜 许可证

本项目根据Apache License 2.0的条款进行许可。

🤗 引用

@misc{radford2022whisper,
  doi = {10.48550/ARXIV.2212.04356},
  url = {https://arxiv.org/abs/2212.04356},
  author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
  title = {Robust Speech Recognition via Large-Scale Weak Supervision},
  publisher = {arXiv},
  year = {2022},
  copyright = {arXiv.org perpetual, non-exclusive license}
}