William Tambellini commited on
Commit
c160b58
·
unverified ·
1 Parent(s): b4d05df

examples : add support for decoding input with ffmpeg (Linux) (#2133)

Browse files

- search for ffmpeg libs/headers at cmake time
- added ffmpeg-transcode.cpp into libcommon if ffmpeg on
- hooked ffmpeg transcoding in common read_wav(...)
- passed test:
./main -m ggml-base.en.bin -f samples/jfk.mp3

CMakeLists.txt CHANGED
@@ -59,6 +59,10 @@ option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDA
59
 
60
  option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
61
 
 
 
 
 
62
  option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
63
  option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
64
  option(WHISPER_NO_AVX512 "whisper: disable AVX512" ON)
@@ -125,6 +129,26 @@ else()
125
  set(CMAKE_CXX_STANDARD 11)
126
  endif()
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  # on APPLE
129
  if (APPLE)
130
  # include Accelerate framework
 
59
 
60
  option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
61
 
62
+ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
63
+ option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
64
+ endif()
65
+
66
  option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
67
  option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
68
  option(WHISPER_NO_AVX512 "whisper: disable AVX512" ON)
 
129
  set(CMAKE_CXX_STANDARD 11)
130
  endif()
131
 
132
if (WHISPER_FFMPEG)
    # As of cmake 3.27, there is no official cmake support for FindFFmpeg.
    # Consequently we added a FindFFmpeg.cmake script in the cmake subfolder.
    # whisper.cpp does not need the full ffmpeg libs, just AVFORMAT AVCODEC AVUTIL SWRESAMPLE:
    # - libswresample performs highly optimized audio resampling, rematrixing and sample format conversion operations
    # - libavcodec provides a generic encoding/decoding framework and contains multiple decoders and encoders for audio, video and subtitle streams, and several bitstream filters.
    # - libavformat provides a generic framework for multiplexing and demultiplexing (muxing and demuxing) audio, video and subtitle streams.
    find_package(FFmpeg REQUIRED)

    # Defensive fallback (REQUIRED is expected to stop the configure step already).
    # The variable is tested by name: expanding it inside if() would re-evaluate
    # its value and produce a malformed condition when it is unset.
    if (NOT FFMPEG_FOUND)
        message(FATAL_ERROR "Cannot find ffmpeg libs/headers")
    endif()

    message(STATUS "Found ffmpeg libs: ${FFMPEG_LIBRARIES}")
    message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
    message(STATUS "ffmpeg definitions: ${FFMPEG_DEFINITIONS}")
    message(STATUS "Found avformat ${AVFORMAT_VERSION}")

    # Make the ffmpeg headers, the WHISPER_FFMPEG preprocessor symbol and the
    # ffmpeg libraries available to the targets defined after this point.
    include_directories(${FFMPEG_INCLUDE_DIRS})
    add_compile_definitions(WHISPER_FFMPEG)
    set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${FFMPEG_LIBRARIES})
endif()
151
+
152
  # on APPLE
153
  if (APPLE)
154
  # include Accelerate framework
cmake/FindFFmpeg.cmake ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # From
2
+ # https://github.com/snikulov/cmake-modules/blob/master/FindFFmpeg.cmake
3
+ #
4
+ # vim: ts=2 sw=2
5
+ # - Try to find the required ffmpeg components (default: AVFORMAT, AVCODEC, AVUTIL, SWRESAMPLE)
6
+ #
7
+ # Once done this will define
8
+ # FFMPEG_FOUND - System has all the required components.
9
+ # FFMPEG_INCLUDE_DIRS - Include directory necessary for using the required components headers.
10
+ # FFMPEG_LIBRARIES - Link these to use the required ffmpeg components.
11
+ # FFMPEG_DEFINITIONS - Compiler switches required for using the required ffmpeg components.
12
+ #
13
+ # For each of the components it will additionally set.
14
+ # - AVCODEC
15
+ # - AVDEVICE
16
+ # - AVFORMAT
17
+ # - AVFILTER
18
+ # - AVUTIL
19
+ # - POSTPROC
20
+ # - SWSCALE
21
+ # the following variables will be defined
22
+ # <component>_FOUND - System has <component>
23
+ # <component>_INCLUDE_DIRS - Include directory necessary for using the <component> headers
24
+ # <component>_LIBRARIES - Link these to use <component>
25
+ # <component>_DEFINITIONS - Compiler switches required for using <component>
26
+ # <component>_VERSION - The components version
27
+ #
28
+ # Copyright (c) 2006, Matthias Kretz, <[email protected]>
29
+ # Copyright (c) 2008, Alexander Neundorf, <[email protected]>
30
+ # Copyright (c) 2011, Michael Jansen, <[email protected]>
31
+ #
32
+ # Redistribution and use is allowed according to the terms of the BSD license.
33
+ # For details see the accompanying COPYING-CMAKE-SCRIPTS file.
34
+
35
+ include(FindPackageHandleStandardArgs)
36
+
37
+ # The default components were taken from a survey over other FindFFMPEG.cmake files
38
+ if (NOT FFmpeg_FIND_COMPONENTS)
39
+ set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE)
40
+ endif()
41
+
42
+ #
43
+ ### Macro: set_component_found
44
+ #
45
+ # Marks the given component as found if both *_LIBRARIES AND *_INCLUDE_DIRS is present.
46
+ #
47
# set_component_found(<component>)
#
# Sets <component>_FOUND to TRUE in the caller's scope when both
# <component>_LIBRARIES and <component>_INCLUDE_DIRS evaluate to true;
# otherwise only a DEBUG message is emitted and <component>_FOUND is left alone.
macro(set_component_found _component)
  if (NOT ${_component}_LIBRARIES OR NOT ${_component}_INCLUDE_DIRS)
    message(DEBUG " - ${_component} not found.")
  else ()
    message(DEBUG " - ${_component} found.")
    set(${_component}_FOUND TRUE)
  endif ()
endmacro()
55
+
56
+ #
57
+ ### Macro: find_component
58
+ #
59
+ # Checks for the given component by invoking pkgconfig and then looking up the libraries and
60
+ # include directories.
61
+ #
62
# find_component(<component> <pkgconfig-module> <library> <header>)
#
# Locates one ffmpeg component and records the result in
#   <component>_INCLUDE_DIRS, <component>_LIBRARIES,
#   <component>_DEFINITIONS,  <component>_VERSION (all cached, marked advanced)
# and, through set_component_found(), in <component>_FOUND.
macro(find_component _component _pkgconfig _library _header)

  # Outside Windows, ask pkg-config first; its answers are only used as
  # search HINTS and as the source of the CFLAGS / version values below.
  if (NOT WIN32)
    find_package(PkgConfig)
    if (PKG_CONFIG_FOUND)
      pkg_check_modules(PC_${_component} ${_pkgconfig})
      message(STATUS "Pkgconfig found: ${PC_${_component}_INCLUDEDIR}")
      message(STATUS "Pkgconfig found: ${PC_${_component}_INCLUDE_DIRS}")
      message(STATUS "${PC_${_component}_CFLAGS}")
    endif ()
  endif ()

  # Header lookup.
  find_path(${_component}_INCLUDE_DIRS
    NAMES
      ${_header}
    HINTS
      ${PC_${_component}_INCLUDEDIR}
      ${PC_${_component}_INCLUDE_DIRS}
    PATH_SUFFIXES
      ffmpeg
  )

  # Library lookup: the plain library name is tried first, then the explicit
  # static archive name.
  # Todo later: add option to prefer static libs over dynamic.
  find_library(${_component}_LIBRARIES
    NAMES
      ${_library}
      lib${_library}.a
    HINTS
      ${PC_${_component}_LIBDIR}
      ${PC_${_component}_LIBRARY_DIRS}
  )

  # Values reported by pkg-config (empty when pkg-config was not used).
  set(${_component}_DEFINITIONS ${PC_${_component}_CFLAGS_OTHER} CACHE STRING "The ${_component} CFLAGS.")
  set(${_component}_VERSION ${PC_${_component}_VERSION} CACHE STRING "The ${_component} version number.")

  set_component_found(${_component})

  mark_as_advanced(
    ${_component}_INCLUDE_DIRS
    ${_component}_LIBRARIES
    ${_component}_DEFINITIONS
    ${_component}_VERSION
  )

endmacro()
105
+
106
+
107
+ # Check for cached results. If there are skip the costly part.
108
# The detection below only runs while FFMPEG_LIBRARIES is empty, i.e. when no
# result of a previous run is available in the cache.
if (NOT FFMPEG_LIBRARIES)

  # Probe every known component:
  #              <component> <pkg-config>   <library>  <header>
  find_component(AVCODEC     libavcodec     avcodec    libavcodec/avcodec.h)
  find_component(AVFORMAT    libavformat    avformat   libavformat/avformat.h)
  find_component(AVDEVICE    libavdevice    avdevice   libavdevice/avdevice.h)
  # AVRESAMPLE (old name for swresample) is deliberately not probed.
  find_component(AVUTIL      libavutil      avutil     libavutil/avutil.h)
  find_component(AVFILTER    libavfilter    avfilter   libavfilter/avfilter.h)
  find_component(SWSCALE     libswscale     swscale    libswscale/swscale.h)
  find_component(POSTPROC    libpostproc    postproc   libpostproc/postprocess.h)
  find_component(SWRESAMPLE  libswresample  swresample libswresample/swresample.h)

  # Fold the settings of each requested component that was found into the
  # aggregate FFMPEG_* variables. Requested components that are missing are
  # skipped here.
  foreach (_requested ${FFmpeg_FIND_COMPONENTS})
    if (${_requested}_FOUND)
      list(APPEND FFMPEG_LIBRARIES    ${${_requested}_LIBRARIES})
      list(APPEND FFMPEG_DEFINITIONS  ${${_requested}_DEFINITIONS})
      list(APPEND FFMPEG_INCLUDE_DIRS ${${_requested}_INCLUDE_DIRS})
    endif ()
  endforeach ()

  # Several components usually share one include directory: keep each once.
  if (FFMPEG_INCLUDE_DIRS)
    list(REMOVE_DUPLICATES FFMPEG_INCLUDE_DIRS)
  endif ()

  # Store the aggregate results in the cache.
  set(FFMPEG_INCLUDE_DIRS ${FFMPEG_INCLUDE_DIRS} CACHE STRING "The FFmpeg include directories." FORCE)
  set(FFMPEG_LIBRARIES    ${FFMPEG_LIBRARIES}    CACHE STRING "The FFmpeg libraries." FORCE)
  set(FFMPEG_DEFINITIONS  ${FFMPEG_DEFINITIONS}  CACHE STRING "The FFmpeg cflags." FORCE)

  mark_as_advanced(
    FFMPEG_INCLUDE_DIRS
    FFMPEG_LIBRARIES
    FFMPEG_DEFINITIONS
  )

endif ()
148
+
149
# Now set the noncached _FOUND vars for the components.
# This also runs when the detection above was skipped because of cached
# results, so the names must match the components passed to find_component():
# the post-processing component is POSTPROC, and SWRESAMPLE takes the place of
# AVRESAMPLE, which is not probed.
# whisper.cpp does not need SWSCALE
foreach (_component AVCODEC AVDEVICE AVFORMAT AVUTIL POSTPROC SWRESAMPLE)
  set_component_found(${_component})
endforeach ()
154
+
155
# Variables that must all be set for the package to count as found: the two
# aggregate variables plus the library and include dir of every requested
# component.
set(_FFmpeg_REQUIRED_VARS FFMPEG_LIBRARIES FFMPEG_INCLUDE_DIRS)
foreach (_required_component ${FFmpeg_FIND_COMPONENTS})
  list(APPEND _FFmpeg_REQUIRED_VARS
    ${_required_component}_LIBRARIES
    ${_required_component}_INCLUDE_DIRS)
endforeach ()

# Let the standard helper validate the list and report missing variables.
find_package_handle_standard_args(FFmpeg DEFAULT_MSG ${_FFmpeg_REQUIRED_VARS})
163
+
examples/CMakeLists.txt CHANGED
@@ -22,6 +22,10 @@ endif()
22
 
23
  set(TARGET common)
24
 
 
 
 
 
25
  add_library(${TARGET} STATIC
26
  common.h
27
  common.cpp
@@ -29,6 +33,7 @@ add_library(${TARGET} STATIC
29
  common-ggml.cpp
30
  grammar-parser.h
31
  grammar-parser.cpp
 
32
  )
33
 
34
  include(DefaultTargetOptions)
 
22
 
23
  set(TARGET common)
24
 
25
+ if (WHISPER_FFMPEG)
26
+ set(COMMON_SOURCES_FFMPEG ffmpeg-transcode.cpp)
27
+ endif()
28
+
29
  add_library(${TARGET} STATIC
30
  common.h
31
  common.cpp
 
33
  common-ggml.cpp
34
  grammar-parser.h
35
  grammar-parser.cpp
36
+ ${COMMON_SOURCES_FFMPEG}
37
  )
38
 
39
  include(DefaultTargetOptions)
examples/common.cpp CHANGED
@@ -24,6 +24,11 @@
24
  #include <io.h>
25
  #endif
26
 
 
 
 
 
 
27
  // Function to check if the next argument exists
28
  std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
29
  if (i + 1 < argc && argv[i + 1][0] != '-') {
@@ -637,7 +642,7 @@ bool is_wav_buffer(const std::string buf) {
637
 
638
  bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
639
  drwav wav;
640
- std::vector<uint8_t> wav_data; // used for pipe input from stdin
641
 
642
  if (fname == "-") {
643
  {
@@ -670,8 +675,19 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
670
  }
671
  }
672
  else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
 
 
 
 
 
 
 
 
 
 
673
  fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
674
  return false;
 
675
  }
676
 
677
  if (wav.channels != 1 && wav.channels != 2) {
 
24
  #include <io.h>
25
  #endif
26
 
27
#ifdef WHISPER_FFMPEG
// Implemented in ffmpeg-transcode.cpp, which is only compiled into the common
// lib when whisper is built with ffmpeg support.
// Decodes the file `ifname` into an in-memory WAV file stored in `wav_data`.
// Returns 0 on success and a non-zero value on failure; the return type must
// stay `int` to match the definition.
extern int ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
#endif
31
+
32
  // Function to check if the next argument exists
33
  std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
34
  if (i + 1 < argc && argv[i + 1][0] != '-') {
 
642
 
643
  bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
644
  drwav wav;
645
+ std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output
646
 
647
  if (fname == "-") {
648
  {
 
675
  }
676
  }
677
  else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
678
+ #if defined(WHISPER_FFMPEG)
679
+ if (ffmpeg_decode_audio(fname, wav_data) != 0) {
680
+ fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
681
+ return false;
682
+ }
683
+ if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
684
+ fprintf(stderr, "error: failed to read wav data as wav \n");
685
+ return false;
686
+ }
687
+ #else
688
  fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
689
  return false;
690
+ #endif
691
  }
692
 
693
  if (wav.channels != 1 && wav.channels != 2) {
examples/ffmpeg-transcode.cpp ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* SPDX-License-Identifier: GPL-2.0 */
2
+
3
+ /*
4
+ * transcode.c - convert audio file to WAVE
5
+ *
6
+ * Copyright (C) 2019 Andrew Clayton <[email protected]>
7
+ * Copyright (C) 2024 William Tambellini <[email protected]>
8
+ */
9
+
10
+ // Just for a convenient C++ API
11
+ #include <vector>
12
+ #include <string>
13
+
14
+ // C
15
+ #include <stdio.h>
16
+ #include <stdlib.h>
17
+ #include <string.h>
18
+ #include <stdbool.h>
19
+ #include <stdint.h>
20
+ #include <sys/types.h>
21
+ #include <sys/stat.h>
22
+ #include <fcntl.h>
23
+ #include <unistd.h>
24
+ #include <sys/mman.h>
25
+
26
+ extern "C" {
27
+ #include <libavutil/opt.h>
28
+ #include <libavcodec/avcodec.h>
29
+ #include <libavformat/avformat.h>
30
+ #include <libswresample/swresample.h>
31
+ }
32
+
33
+ typedef uint64_t u64;
34
+ typedef int64_t s64;
35
+ typedef uint32_t u32;
36
+ typedef int32_t s32;
37
+ typedef uint16_t u16;
38
+ typedef int16_t s16;
39
+ typedef uint8_t u8;
40
+ typedef int8_t s8;
41
+
42
+ #define WAVE_SAMPLE_RATE 16000
43
+ #define AVIO_CTX_BUF_SZ 4096
44
+
45
+ static const char* ffmpegLog = getenv("FFMPEG_LOG");
46
+ // Todo: add __FILE__ __LINE__
47
+ #define LOG(...) \
48
+ do { if (ffmpegLog) fprintf(stderr, __VA_ARGS__); } while(0) // C99
49
+
50
+ /*
51
+ * WAVE file header based on definition from
52
+ * https://gist.github.com/Jon-Schneider/8b7c53d27a7a13346a643dac9c19d34f
53
+ *
54
+ * We must ensure this structure doesn't have any holes or
55
+ * padding so we can just map it straight to the WAVE data.
56
+ */
57
struct wave_hdr {
    /* RIFF Header: "RIFF" */
    char riff_header[4];
    /* size of audio data + sizeof(struct wave_hdr) - 8 */
    int32_t wav_size;
    /* "WAVE" */
    char wav_header[4];

    /* Format Header */
    /* "fmt " (includes trailing space) */
    char fmt_header[4];
    /* Should be 16 for PCM */
    int32_t fmt_chunk_size;
    /* Should be 1 for PCM. 3 for IEEE Float */
    int16_t audio_format;
    int16_t num_channels;
    int32_t sample_rate;
    /*
     * Number of bytes per second
     * sample_rate * num_channels * bit_depth/8
     */
    int32_t byte_rate;
    /* num_channels * bytes per sample */
    int16_t sample_alignment;
    /* bits per sample */
    int16_t bit_depth;

    /* Data Header */
    /* "data" */
    char data_header[4];
    /*
     * size of audio
     * number of samples * num_channels * bit_depth/8
     */
    int32_t data_bytes;
} __attribute__((__packed__));

/*
 * The structure is copied byte for byte in front of the samples, so its
 * layout must not depend on the platform's width of `int`: the fields use
 * fixed-width types and the total size is checked at compile time.
 */
static_assert(sizeof(struct wave_hdr) == 44,
              "wave_hdr must be exactly the 44-byte PCM WAVE header");
93
+
94
+ struct audio_buffer {
95
+ u8 *ptr;
96
+ int size; /* size left in the buffer */
97
+ };
98
+
99
+ static void set_wave_hdr(wave_hdr& wh, size_t size) {
100
+ memcpy(&wh.riff_header, "RIFF", 4);
101
+ wh.wav_size = size + sizeof(struct wave_hdr) - 8;
102
+ memcpy(&wh.wav_header, "WAVE", 4);
103
+ memcpy(&wh.fmt_header, "fmt ", 4);
104
+ wh.fmt_chunk_size = 16;
105
+ wh.audio_format = 1;
106
+ wh.num_channels = 1;
107
+ wh.sample_rate = WAVE_SAMPLE_RATE;
108
+ wh.sample_alignment = 2;
109
+ wh.bit_depth = 16;
110
+ wh.byte_rate = wh.sample_rate * wh.sample_alignment;
111
+ memcpy(&wh.data_header, "data", 4);
112
+ wh.data_bytes = size;
113
+ }
114
+
115
+ static void write_wave_hdr(int fd, size_t size) {
116
+ struct wave_hdr wh;
117
+ set_wave_hdr(wh, size);
118
+ write(fd, &wh, sizeof(struct wave_hdr));
119
+ }
120
+
121
+ static int map_file(int fd, u8 **ptr, size_t *size)
122
+ {
123
+ struct stat sb;
124
+
125
+ fstat(fd, &sb);
126
+ *size = sb.st_size;
127
+
128
+ *ptr = (u8*)mmap(NULL, *size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
129
+ if (*ptr == MAP_FAILED) {
130
+ perror("mmap");
131
+ return -1;
132
+ }
133
+
134
+ return 0;
135
+ }
136
+
137
+ static int read_packet(void *opaque, u8 *buf, int buf_size)
138
+ {
139
+ struct audio_buffer *audio_buf = (audio_buffer*)opaque;
140
+
141
+ buf_size = FFMIN(buf_size, audio_buf->size);
142
+
143
+ /* copy internal buffer data to buf */
144
+ memcpy(buf, audio_buf->ptr, buf_size);
145
+ audio_buf->ptr += buf_size;
146
+ audio_buf->size -= buf_size;
147
+
148
+ return buf_size;
149
+ }
150
+
151
+ static void convert_frame(struct SwrContext *swr, AVCodecContext *codec,
152
+ AVFrame *frame, s16 **data, int *size, bool flush)
153
+ {
154
+ int nr_samples;
155
+ s64 delay;
156
+ u8 *buffer;
157
+
158
+ delay = swr_get_delay(swr, codec->sample_rate);
159
+ nr_samples = av_rescale_rnd(delay + frame->nb_samples,
160
+ WAVE_SAMPLE_RATE, codec->sample_rate,
161
+ AV_ROUND_UP);
162
+ av_samples_alloc(&buffer, NULL, 1, nr_samples, AV_SAMPLE_FMT_S16, 0);
163
+
164
+ /*
165
+ * !flush is used to check if we are flushing any remaining
166
+ * conversion buffers...
167
+ */
168
+ nr_samples = swr_convert(swr, &buffer, nr_samples,
169
+ !flush ? (const u8 **)frame->data : NULL,
170
+ !flush ? frame->nb_samples : 0);
171
+
172
+ *data = (s16*)realloc(*data, (*size + nr_samples) * sizeof(s16));
173
+ memcpy(*data + *size, buffer, nr_samples * sizeof(s16));
174
+ *size += nr_samples;
175
+ av_freep(&buffer);
176
+ }
177
+
178
+ static bool is_audio_stream(const AVStream *stream)
179
+ {
180
+ if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
181
+ return true;
182
+
183
+ return false;
184
+ }
185
+
186
+ // Return non zero on error, 0 on success
187
+ // audio_buffer: input memory
188
+ // data: decoded output audio data (wav file)
189
+ // size: size of output data
190
+ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
191
+ {
192
+ LOG("decode_audio: input size: %d\n", audio_buf->size);
193
+ AVFormatContext *fmt_ctx;
194
+ AVIOContext *avio_ctx;
195
+ AVStream *stream;
196
+ AVCodecContext *codec;
197
+ AVPacket packet;
198
+ AVFrame *frame;
199
+ struct SwrContext *swr;
200
+ u8 *avio_ctx_buffer;
201
+ unsigned int i;
202
+ int stream_index = -1;
203
+ int err;
204
+ const size_t errbuffsize = 1024;
205
+ char errbuff[errbuffsize];
206
+
207
+ av_register_all(); // from avformat. Still a must-have call for ffmpeg v3! (can be skipped for later versions)
208
+
209
+ fmt_ctx = avformat_alloc_context();
210
+ avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
211
+ LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);
212
+ avio_ctx = avio_alloc_context(avio_ctx_buffer, AVIO_CTX_BUF_SZ, 0, audio_buf, &read_packet, NULL, NULL);
213
+ fmt_ctx->pb = avio_ctx;
214
+
215
+ // open the input stream and read header
216
+ err = avformat_open_input(&fmt_ctx, NULL, NULL, NULL);
217
+ if (err) {
218
+ LOG("Could not read audio buffer: %d: %s\n", err, av_make_error_string(errbuff, errbuffsize, err));
219
+ return err;
220
+ }
221
+
222
+ err = avformat_find_stream_info(fmt_ctx, NULL);
223
+ if (err < 0) {
224
+ LOG("Could not retrieve stream info from audio buffer: %d\n", err);
225
+ return err;
226
+ }
227
+
228
+ for (i = 0; i < fmt_ctx->nb_streams; i++) {
229
+ if (is_audio_stream(fmt_ctx->streams[i])) {
230
+ stream_index = i;
231
+ break;
232
+ }
233
+ }
234
+
235
+ if (stream_index == -1) {
236
+ LOG("Could not retrieve audio stream from buffer\n");
237
+ return -1;
238
+ }
239
+
240
+ stream = fmt_ctx->streams[stream_index];
241
+ codec = avcodec_alloc_context3(
242
+ avcodec_find_decoder(stream->codecpar->codec_id));
243
+ avcodec_parameters_to_context(codec, stream->codecpar);
244
+ err = avcodec_open2(codec, avcodec_find_decoder(codec->codec_id),
245
+ NULL);
246
+ if (err) {
247
+ LOG("Failed to open decoder for stream #%d in audio buffer\n", stream_index);
248
+ return err;
249
+ }
250
+
251
+ /* prepare resampler */
252
+ swr = swr_alloc();
253
+
254
+ av_opt_set_int(swr, "in_channel_count", codec->channels, 0);
255
+ av_opt_set_int(swr, "out_channel_count", 1, 0);
256
+ av_opt_set_int(swr, "in_channel_layout", codec->channel_layout, 0);
257
+ av_opt_set_int(swr, "out_channel_layout", AV_CH_LAYOUT_MONO, 0);
258
+ av_opt_set_int(swr, "in_sample_rate", codec->sample_rate, 0);
259
+ av_opt_set_int(swr, "out_sample_rate", WAVE_SAMPLE_RATE, 0);
260
+ av_opt_set_sample_fmt(swr, "in_sample_fmt", codec->sample_fmt, 0);
261
+ av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
262
+
263
+ swr_init(swr);
264
+ if (!swr_is_initialized(swr)) {
265
+ LOG("Resampler has not been properly initialized\n");
266
+ return -1;
267
+ }
268
+
269
+ av_init_packet(&packet);
270
+ frame = av_frame_alloc();
271
+ if (!frame) {
272
+ LOG("Error allocating the frame\n");
273
+ return -1;
274
+ }
275
+
276
+ /* iterate through frames */
277
+ *data = NULL;
278
+ *size = 0;
279
+ while (av_read_frame(fmt_ctx, &packet) >= 0) {
280
+ avcodec_send_packet(codec, &packet);
281
+
282
+ err = avcodec_receive_frame(codec, frame);
283
+ if (err == AVERROR(EAGAIN))
284
+ continue;
285
+
286
+ convert_frame(swr, codec, frame, data, size, false);
287
+ }
288
+ /* Flush any remaining conversion buffers... */
289
+ convert_frame(swr, codec, frame, data, size, true);
290
+
291
+ av_frame_free(&frame);
292
+ swr_free(&swr);
293
+ //avio_context_free(); // todo?
294
+ avcodec_close(codec);
295
+ avformat_close_input(&fmt_ctx);
296
+ avformat_free_context(fmt_ctx);
297
+
298
+ if (avio_ctx) {
299
+ av_freep(&avio_ctx->buffer);
300
+ av_freep(&avio_ctx);
301
+ }
302
+
303
+ return 0;
304
+ }
305
+
306
+ // in mem decoding/conversion/resampling:
307
+ // ifname: input file path
308
+ // owav_data: in mem wav file. Can be forwarded as is to whisper/drwav
309
+ // return 0 on success
310
+ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_data) {
311
+ LOG("ffmpeg_decode_audio: %s\n", ifname.c_str());
312
+ int ifd = open(ifname.c_str(), O_RDONLY);
313
+ if (ifd == -1) {
314
+ fprintf(stderr, "Couldn't open input file %s\n", ifname.c_str());
315
+ return -1;
316
+ }
317
+ u8 *ibuf = NULL;
318
+ size_t ibuf_size;
319
+ int err = map_file(ifd, &ibuf, &ibuf_size);
320
+ if (err) {
321
+ LOG("Couldn't map input file %s\n", ifname.c_str());
322
+ return err;
323
+ }
324
+ LOG("Mapped input file: %x size: %d\n", ibuf, ibuf_size);
325
+ struct audio_buffer inaudio_buf;
326
+ inaudio_buf.ptr = ibuf;
327
+ inaudio_buf.size = ibuf_size;
328
+
329
+ s16 *odata=NULL;
330
+ int osize=0;
331
+
332
+ err = decode_audio(&inaudio_buf, &odata, &osize);
333
+ LOG("decode_audio returned %d \n", err);
334
+ if (err != 0) {
335
+ LOG("decode_audio failed\n");
336
+ return err;
337
+ }
338
+ LOG("decode_audio output size: %d\n", osize);
339
+
340
+ wave_hdr wh;
341
+ const size_t outdatasize = osize * sizeof(s16);
342
+ set_wave_hdr(wh, outdatasize);
343
+ owav_data.resize(sizeof(wave_hdr) + outdatasize);
344
+ // header:
345
+ memcpy(owav_data.data(), &wh, sizeof(wave_hdr));
346
+ // the data:
347
+ memcpy(owav_data.data() + sizeof(wave_hdr), odata, osize* sizeof(s16));
348
+
349
+ return 0;
350
+ }
examples/main/CMakeLists.txt CHANGED
@@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp)
3
 
4
  include(DefaultTargetOptions)
5
 
6
- target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
 
3
 
4
  include(DefaultTargetOptions)
5
 
6
+ target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
samples/.gitignore CHANGED
@@ -1 +1,4 @@
1
  *
 
 
 
 
1
  *
2
!jfk.wav
!jfk.mp3
4
+
samples/jfk.mp3 ADDED
Binary file (76.4 kB). View file
 
tests/CMakeLists.txt CHANGED
@@ -74,3 +74,14 @@ add_test(NAME ${TEST_TARGET}
74
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-large.bin
75
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
76
  set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large")
 
 
 
 
 
 
 
 
 
 
 
 
74
  -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-large.bin
75
  -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
76
  set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large")
77
+
78
+ if (WHISPER_FFMPEG)
79
+ set(TEST_TARGET test-main-tiny-mp3)
80
+ # Check with reviewers: any way to check the output transcription via ctest (diff, ...)?
81
+ add_test(NAME ${TEST_TARGET}
82
+ COMMAND $<TARGET_FILE:main>
83
+ -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.en.bin
84
+ -f ${PROJECT_SOURCE_DIR}/samples/jfk.mp3)
85
+ set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;mp3")
86
+ endif()
87
+