Files
2026-04-07 09:58:54 +02:00

307 lines
10 KiB
Plaintext

// Sources/SubcodecObjC/SCVideoToolboxDecoder.mm
#import "SCVideoToolboxDecoder.h"
#include "AnnexBSplitter.h"
#import <VideoToolbox/VideoToolbox.h>
#import <CoreMedia/CoreMedia.h>
#import <CoreVideo/CoreVideo.h>
#include <vector>
static NSString* const kErrorDomain = @"SCVideoToolboxDecoder";
static NSError* makeVTError(NSString* msg, OSStatus status) {
return [NSError errorWithDomain:kErrorDomain code:status
userInfo:@{NSLocalizedDescriptionKey:
[NSString stringWithFormat:@"%@ (OSStatus %d)", msg, (int)status]}];
}
static NSError* makeError(NSString* msg) {
return [NSError errorWithDomain:kErrorDomain code:-1
userInfo:@{NSLocalizedDescriptionKey: msg}];
}
// Returns the LAST occurrence of the target NAL type in the data.
// This is important because buildStream() prepends subcodec's SPS/PPS before
// OpenH264's SPS/PPS, and we need OpenH264's (the last ones) for VideoToolbox.
static const uint8_t* findNAL(const uint8_t* data, size_t size,
uint8_t targetType, size_t* nalSize) {
const uint8_t* found = nullptr;
size_t foundSize = 0;
for (size_t i = 0; i + 3 < size; ) {
int sc_len = 0;
if (i + 3 < size && data[i]==0 && data[i+1]==0 && data[i+2]==0 && data[i+3]==1)
sc_len = 4;
else if (i + 2 < size && data[i]==0 && data[i+1]==0 && data[i+2]==1)
sc_len = 3;
if (sc_len > 0) {
const uint8_t* nalStart = data + i + sc_len;
uint8_t nal_type = nalStart[0] & 0x1F;
size_t nalEnd = size;
for (size_t j = i + sc_len + 1; j + 2 < size; j++) {
if (data[j]==0 && data[j+1]==0 &&
(data[j+2]==1 || (j + 3 < size && data[j+2]==0 && data[j+3]==1))) {
nalEnd = j;
break;
}
}
if (nal_type == targetType) {
found = nalStart;
foundSize = nalEnd - (i + sc_len);
}
i = nalEnd;
} else {
i++;
}
}
if (found) {
*nalSize = foundSize;
}
return found;
}
static std::vector<uint8_t> annexBToAVCC(const uint8_t* data, size_t size) {
std::vector<uint8_t> avcc;
avcc.reserve(size);
for (size_t i = 0; i + 3 < size; ) {
int sc_len = 0;
if (i + 3 < size && data[i]==0 && data[i+1]==0 && data[i+2]==0 && data[i+3]==1)
sc_len = 4;
else if (i + 2 < size && data[i]==0 && data[i+1]==0 && data[i+2]==1)
sc_len = 3;
if (sc_len > 0) {
size_t nalStart = i + sc_len;
size_t nalEnd = size;
for (size_t j = nalStart + 1; j + 2 < size; j++) {
if (data[j]==0 && data[j+1]==0 &&
(data[j+2]==1 || (j + 3 < size && data[j+2]==0 && data[j+3]==1))) {
nalEnd = j;
break;
}
}
uint8_t nal_type = data[nalStart] & 0x1F;
if (nal_type != 7 && nal_type != 8) {
uint32_t nalLen = (uint32_t)(nalEnd - nalStart);
uint8_t lenBuf[4] = {
(uint8_t)(nalLen >> 24), (uint8_t)(nalLen >> 16),
(uint8_t)(nalLen >> 8), (uint8_t)(nalLen)
};
avcc.insert(avcc.end(), lenBuf, lenBuf + 4);
avcc.insert(avcc.end(), data + nalStart, data + nalEnd);
}
i = nalEnd;
} else {
i++;
}
}
return avcc;
}
struct DecodeContext {
NSMutableArray<SCDecodedFrame *>* frames;
};
static void decompressionCallback(void* decompressionOutputRefCon,
void* sourceFrameRefCon,
OSStatus status,
VTDecodeInfoFlags infoFlags,
CVImageBufferRef imageBuffer,
CMTime presentationTimeStamp,
CMTime presentationDuration) {
if (status != noErr || !imageBuffer) return;
DecodeContext* ctx = (DecodeContext*)decompressionOutputRefCon;
CVPixelBufferRef pixelBuffer = (CVPixelBufferRef)imageBuffer;
CVPixelBufferLockBaseAddress(pixelBuffer, kCVPixelBufferLock_ReadOnly);
int w = (int)CVPixelBufferGetWidth(pixelBuffer);
int h = (int)CVPixelBufferGetHeight(pixelBuffer);
OSType pixelFormat = CVPixelBufferGetPixelFormatType(pixelBuffer);
NSMutableData* yData = [NSMutableData dataWithLength:w * h];
NSMutableData* cbData = [NSMutableData dataWithLength:(w / 2) * (h / 2)];
NSMutableData* crData = [NSMutableData dataWithLength:(w / 2) * (h / 2)];
uint8_t* yDst = (uint8_t*)yData.mutableBytes;
uint8_t* cbDst = (uint8_t*)cbData.mutableBytes;
uint8_t* crDst = (uint8_t*)crData.mutableBytes;
if (pixelFormat == kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange ||
pixelFormat == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange) {
uint8_t* yPlane = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 0);
size_t yStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 0);
uint8_t* uvPlane = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 1);
size_t uvStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 1);
for (int r = 0; r < h; r++)
memcpy(yDst + r * w, yPlane + r * yStride, w);
int cw = w / 2;
int ch = h / 2;
for (int r = 0; r < ch; r++) {
const uint8_t* uvRow = uvPlane + r * uvStride;
for (int c = 0; c < cw; c++) {
cbDst[r * cw + c] = uvRow[c * 2];
crDst[r * cw + c] = uvRow[c * 2 + 1];
}
}
} else if (pixelFormat == kCVPixelFormatType_420YpCbCr8Planar) {
for (int p = 0; p < 3; p++) {
uint8_t* src = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, p);
size_t stride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, p);
int pw = (p == 0) ? w : w / 2;
int ph = (p == 0) ? h : h / 2;
uint8_t* dst = (p == 0) ? yDst : (p == 1) ? cbDst : crDst;
for (int r = 0; r < ph; r++)
memcpy(dst + r * pw, src + r * stride, pw);
}
}
CVPixelBufferUnlockBaseAddress(pixelBuffer, kCVPixelBufferLock_ReadOnly);
SCDecodedFrame* frame = [[SCDecodedFrame alloc] initWithWidth:w height:h
y:yData cb:cbData cr:crData];
[ctx->frames addObject:frame];
}
@implementation SCVideoToolboxDecoder
+ (nullable SCVideoToolboxDecoder *)createDecoderWithError:(NSError **)error {
return [[SCVideoToolboxDecoder alloc] init];
}
- (nullable NSArray<SCDecodedFrame *> *)decodeStream:(NSData *)data
error:(NSError **)error {
const uint8_t* bytes = (const uint8_t*)data.bytes;
size_t length = data.length;
auto packets = split_annex_b_frames(bytes, length);
if (packets.empty()) {
if (error) *error = makeError(@"No frames found in stream");
return nil;
}
size_t spsSize = 0, ppsSize = 0;
const uint8_t* spsNAL = findNAL(packets[0].data, packets[0].size, 7, &spsSize);
const uint8_t* ppsNAL = findNAL(packets[0].data, packets[0].size, 8, &ppsSize);
if (!spsNAL || !ppsNAL) {
if (error) *error = makeError(@"SPS or PPS not found in stream");
return nil;
}
const uint8_t* paramSets[2] = { spsNAL, ppsNAL };
size_t paramSizes[2] = { spsSize, ppsSize };
CMVideoFormatDescriptionRef formatDesc = NULL;
OSStatus status = CMVideoFormatDescriptionCreateFromH264ParameterSets(
kCFAllocatorDefault, 2, paramSets, paramSizes, 4, &formatDesc);
if (status != noErr) {
if (error) *error = makeVTError(@"CMVideoFormatDescriptionCreateFromH264ParameterSets failed", status);
return nil;
}
DecodeContext ctx;
ctx.frames = [NSMutableArray array];
VTDecompressionOutputCallbackRecord callbackRecord;
callbackRecord.decompressionOutputCallback = decompressionCallback;
callbackRecord.decompressionOutputRefCon = &ctx;
NSDictionary* destAttrs = @{
(NSString*)kCVPixelBufferPixelFormatTypeKey:
@(kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange),
};
VTDecompressionSessionRef session = NULL;
status = VTDecompressionSessionCreate(
kCFAllocatorDefault, formatDesc, NULL,
(__bridge CFDictionaryRef)destAttrs,
&callbackRecord, &session);
if (status != noErr) {
CFRelease(formatDesc);
if (error) *error = makeVTError(@"VTDecompressionSessionCreate failed", status);
return nil;
}
NSError* decodeError = nil;
for (auto& pkt : packets) {
auto avcc = annexBToAVCC(pkt.data, pkt.size);
if (avcc.empty()) continue;
CMBlockBufferRef blockBuf = NULL;
status = CMBlockBufferCreateWithMemoryBlock(
kCFAllocatorDefault, NULL, avcc.size(), kCFAllocatorDefault,
NULL, 0, avcc.size(), kCMBlockBufferAssureMemoryNowFlag, &blockBuf);
if (status != noErr) {
decodeError = makeVTError(@"CMBlockBufferCreateWithMemoryBlock failed", status);
break;
}
status = CMBlockBufferReplaceDataBytes(avcc.data(), blockBuf, 0, avcc.size());
if (status != noErr) {
CFRelease(blockBuf);
decodeError = makeVTError(@"CMBlockBufferReplaceDataBytes failed", status);
break;
}
CMSampleBufferRef sampleBuf = NULL;
size_t sampleSize = avcc.size();
status = CMSampleBufferCreate(
kCFAllocatorDefault, blockBuf, true, NULL, NULL,
formatDesc, 1, 0, NULL, 1, &sampleSize, &sampleBuf);
CFRelease(blockBuf);
if (status != noErr) {
decodeError = makeVTError(@"CMSampleBufferCreate failed", status);
break;
}
VTDecodeInfoFlags flagsOut = 0;
status = VTDecompressionSessionDecodeFrame(
session, sampleBuf,
kVTDecodeFrame_1xRealTimePlayback,
NULL, &flagsOut);
CFRelease(sampleBuf);
if (status != noErr) {
decodeError = makeVTError(@"VTDecompressionSessionDecodeFrame failed", status);
break;
}
}
VTDecompressionSessionWaitForAsynchronousFrames(session);
VTDecompressionSessionInvalidate(session);
CFRelease(session);
CFRelease(formatDesc);
if (decodeError) {
if (error) *error = decodeError;
return nil;
}
return ctx.frames;
}
@end