libobs: Fix potential output audio/video sync bug

Problem: When an output is started with encoders that have already been started by another output, and it starts in between the window in between where the first audio packets start coming in and where the first video packet comes in, the output would get audio packets with timestamps potentially much later than the first video frame when the first video frame finally comes in. The audio/video encoders will almost always have a differing delay. Solution: Detect that starting window, and if within that starting window, wait for a new keyframe from video instead of trying to sync up video. Additional Notes: In these cases where an output starts with already-active encoders, this patch also reduces the potential sync offset between the first video frame and the first audio frame. Before, it would sync the first video frame with the first audio frames right after that, but now it syncs with the closest audio frame in the interleaved array, which halves the potential sync difference between the first video frame and the first audio frame of already-active encoders. (So now the potential sync difference is roughly 11.6 milliseconds at 44.1khz audio, where it was 23.2 before)
2016-01-25 02:41:02 -08:00
parent c4657da2f1
commit 65eb3c0815
1 changed files with 110 additions and 23 deletions
--- a/libobs/obs-output.c
+++ b/libobs/obs-output.c
@@ -877,46 +877,97 @@ static inline void set_higher_ts(struct obs_output *output,
 	}
 }

-static bool can_prune_interleaved_packet(struct obs_output *output, size_t idx)
+static struct encoder_packet *find_first_packet_type(struct obs_output *output,
+		enum obs_encoder_type type, size_t audio_idx);
+
+/* gets the point where audio and video are closest together */
+static size_t get_interleaved_start_idx(struct obs_output *output)
+{
+	int64_t closest_diff = 0x7FFFFFFFFFFFFFFFLL;
+	struct encoder_packet *first_video = find_first_packet_type(output,
+			OBS_ENCODER_VIDEO, 0);
+	size_t video_idx = DARRAY_INVALID;
+	size_t idx = 0;
+
+	for (size_t i = 0; i < output->interleaved_packets.num; i++) {
+		struct encoder_packet *packet =
+			&output->interleaved_packets.array[i];
+		int64_t diff;
+
+		if (packet->type != OBS_ENCODER_AUDIO) {
+			if (packet == first_video)
+				video_idx = i;
+			continue;
+		}
+
+		diff = llabs(packet->dts_usec - first_video->dts_usec);
+		if (diff < closest_diff) {
+			closest_diff = diff;
+			idx = i;
+		}
+	}
+
+	return video_idx < idx ? video_idx : idx;
+}
+
+static bool prune_premature_video_packet(struct obs_output *output)
 {
 	struct encoder_packet *packet;
 	struct encoder_packet *next;
+	int64_t duration_usec;
+	int64_t diff;

-	if (idx >= (output->interleaved_packets.num - 1))
+	if (output->interleaved_packets.num < 2)
 		return false;

-	packet = &output->interleaved_packets.array[idx];
+	packet = &output->interleaved_packets.array[0];

-	/* audio packets will almost always come before video packets,
-	 * so it should only ever be necessary to prune audio packets */
-	if (packet->type != OBS_ENCODER_AUDIO)
+	if (packet->type == OBS_ENCODER_AUDIO)
 		return false;

-	next = &output->interleaved_packets.array[idx + 1];
+	next = &output->interleaved_packets.array[1];
+	diff = next->dts_usec - packet->dts_usec;
+	duration_usec = packet->timebase_num * 1000000LL / packet->timebase_den;

-	if (next->type == OBS_ENCODER_VIDEO &&
-	    next->dts_usec == packet->dts_usec)
-		return false;
-
-	return true;
+	return diff > duration_usec;
 }

+static void discard_to_idx(struct obs_output *output, size_t idx)
+{
+	for (size_t i = 0; i < idx; i++) {
+		struct encoder_packet *packet =
+			&output->interleaved_packets.array[i];
+		obs_free_encoder_packet(packet);
+	}
+
+	da_erase_range(output->interleaved_packets, 0, idx);
+}
+
+#define DEBUG_STARTING_PACKETS 0
+
 static void prune_interleaved_packets(struct obs_output *output)
 {
 	size_t start_idx = 0;

-	while (can_prune_interleaved_packet(output, start_idx))
-		start_idx++;
-
-	if (start_idx) {
-		for (size_t i = 0; i < start_idx; i++) {
-			struct encoder_packet *packet =
-				&output->interleaved_packets.array[i];
-			obs_free_encoder_packet(packet);
-		}
-
-		da_erase_range(output->interleaved_packets, 0, start_idx);
+#if DEBUG_STARTING_PACKETS == 1
+	blog(LOG_DEBUG, "--------- Pruning! ---------");
+	for (size_t i = 0; i < output->interleaved_packets.num; i++) {
+		struct encoder_packet *packet =
+			&output->interleaved_packets.array[i];
+		blog(LOG_DEBUG, "packet: %s, ts: %lld",
+				packet->type == OBS_ENCODER_AUDIO ?
+				"audio" : "video", packet->dts_usec);
 	}
+#endif
+
+	/* prunes the first video packet if it's too far away from audio */
+	if (prune_premature_video_packet(output))
+		start_idx++;
+	else
+		start_idx = get_interleaved_start_idx(output);
+
+	if (start_idx)
+		discard_to_idx(output, start_idx);
 }

 static struct encoder_packet *find_first_packet_type(struct obs_output *output,
@@ -966,6 +1017,16 @@ static bool initialize_interleaved_packets(struct obs_output *output)
 	for (size_t i = 0; i < audio_mixes; i++)
 		output->audio_offsets[i] = audio[i]->dts;

+#if DEBUG_STARTING_PACKETS == 1
+	int64_t v = video->dts_usec;
+	int64_t a = audio[0]->dts_usec;
+	int64_t diff = v - a;
+
+	blog(LOG_DEBUG, "output '%s' offset for video: %lld, audio: %lld, "
+			"diff: %lldms", output->context.name, v, a,
+			diff / 1000LL);
+#endif
+
 	/* subtract offsets from highest TS offset variables */
 	output->highest_audio_ts -= audio[0]->dts_usec;
 	output->highest_video_ts -= video->dts_usec;
@@ -1009,6 +1070,23 @@ static void resort_interleaved_packets(struct obs_output *output)
 	da_free(old_array);
 }

+static void discard_unused_audio_packets(struct obs_output *output,
+		int64_t dts_usec)
+{
+	size_t idx = 0;
+
+	for (; idx < output->interleaved_packets.num; idx++) {
+		struct encoder_packet *p =
+			&output->interleaved_packets.array[idx];
+
+		if (p->dts_usec >= dts_usec)
+			break;
+	}
+
+	if (idx)
+		discard_to_idx(output, idx);
+}
+
 static void interleave_packets(void *data, struct encoder_packet *packet)
 {
 	struct obs_output     *output = data;
@@ -1020,6 +1098,15 @@ static void interleave_packets(void *data, struct encoder_packet *packet)

 	pthread_mutex_lock(&output->interleaved_mutex);

+	/* if first video frame is not a keyframe, discard until received */
+	if (!output->received_video &&
+	    packet->type == OBS_ENCODER_VIDEO &&
+	    !packet->keyframe) {
+		discard_unused_audio_packets(output, packet->dts_usec);
+		pthread_mutex_unlock(&output->interleaved_mutex);
+		return;
+	}
+
 	was_started = output->received_audio && output->received_video;

 	if (output->active_delay_ns)