/* obs-studio/libobs/media-io/video-io.c */

/******************************************************************************
    Copyright (C) 2013 by Hugh Bailey <obs.jim@gmail.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/

#include <assert.h>
#include <inttypes.h>
#include "../util/bmem.h"
#include "../util/platform.h"
#include "../util/profiler.h"
#include "../util/threading.h"
#include "../util/darray.h"
#include "../util/util_uint64.h"
#include "format-conversion.h"
#include "video-io.h"
#include "video-frame.h"
#include "video-scaler.h"

extern profiler_name_store_t *obs_get_profiler_name_store(void);

#define MAX_CONVERT_BUFFERS 3
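
/* Frame cache design: the graphics thread drives the encoder thread.
 * Rendered frames are copied into a small circular cache, and the update
 * semaphore is posted once for each frame the video thread should
 * consume.  If the cache fills because encoding falls behind, the newest
 * cached frame's encode counter is incremented instead of caching a new
 * frame, duplicating it to keep output on time while reducing encoding
 * load. */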
#define MAX_CACHE_SIZE 16
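
/* One slot in the circular frame cache: 'count' is how many more times
 * this frame should be sent to the inputs (values above one mean
 * deliberate duplication); 'skipped' counts duplicated encodes that get
 * tallied into the output's skipped_frames statistic. */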
struct cached_frame_info {
	struct video_data frame;
	int skipped;
	int count;
};
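
/* A registered consumer of raw frames: an optional scaler plus rotating
 * convert buffers for format conversion, and the callback that receives
 * each (possibly converted) frame. */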
struct video_input {
	struct video_scale_info conversion;
	video_scaler_t *scaler;
	struct video_frame frame[MAX_CONVERT_BUFFERS];
	int cur_frame;

	void (*callback)(void *param, struct video_data *frame);
	void *param;
};

static inline void video_input_free(struct video_input *input)
{
	for (size_t i = 0; i < MAX_CONVERT_BUFFERS; i++)
		video_frame_free(&input->frame[i]);

	video_scaler_destroy(input->scaler);
}
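
/* Core video output context: owns the video thread, the circular frame
 * cache (first_added is the oldest slot, last_added the newest, with
 * available_frames free slots remaining), and the list of registered
 * inputs. */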
struct video_output {
	struct video_output_info info;

	pthread_t thread;
	pthread_mutex_t data_mutex;
	bool stop;

	os_sem_t *update_semaphore;
	uint64_t frame_time;
	volatile long skipped_frames;
	volatile long total_frames;

	bool initialized;

	pthread_mutex_t input_mutex;
	DARRAY(struct video_input) inputs;

	size_t available_frames;
	size_t first_added;
	size_t last_added;
	struct cached_frame_info cache[MAX_CACHE_SIZE];

	volatile bool raw_active;
	volatile long gpu_refs;
};

/* ------------------------------------------------------------------------- */
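
/* If the input has a scaler attached, convert the frame into the next
 * rotating convert buffer and repoint the frame's plane pointers and
 * linesizes at the converted data; otherwise pass the frame through
 * untouched. */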
static inline bool scale_video_output(struct video_input *input,
				      struct video_data *data)
{
	bool success = true;

	if (input->scaler) {
		struct video_frame *frame;

		if (++input->cur_frame == MAX_CONVERT_BUFFERS)
			input->cur_frame = 0;

		frame = &input->frame[input->cur_frame];

		success = video_scaler_scale(input->scaler, frame->data,
					     frame->linesize,
					     (const uint8_t *const *)data->data,
					     data->linesize);

		if (success) {
			for (size_t i = 0; i < MAX_AV_PLANES; i++) {
				data->data[i] = frame->data[i];
				data->linesize[i] = frame->linesize[i];
			}
		} else {
			blog(LOG_WARNING, "video-io: Could not scale frame!");
		}
	}

	return success;
}
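
/* Dispatch the oldest cached frame to every registered input, then
 * decrement its encode count under the data mutex; returns true once
 * the slot has been fully consumed and released back to the cache. */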
static inline bool video_output_cur_frame(struct video_output *video)
{
	struct cached_frame_info *frame_info;
	bool complete;
	bool skipped;

	/* -------------------------------- */

	pthread_mutex_lock(&video->data_mutex);

	frame_info = &video->cache[video->first_added];

	pthread_mutex_unlock(&video->data_mutex);

	/* -------------------------------- */

	pthread_mutex_lock(&video->input_mutex);

	for (size_t i = 0; i < video->inputs.num; i++) {
		struct video_input *input = video->inputs.array + i;
		struct video_data frame = frame_info->frame;

		if (scale_video_output(input, &frame))
			input->callback(input->param, &frame);
	}

	pthread_mutex_unlock(&video->input_mutex);

	/* -------------------------------- */

	pthread_mutex_lock(&video->data_mutex);

	frame_info->frame.timestamp += video->frame_time;
	complete = --frame_info->count == 0;
	skipped = frame_info->skipped > 0;

	if (complete) {
		if (++video->first_added == video->info.cache_size)
			video->first_added = 0;

		if (++video->available_frames == video->info.cache_size)
			video->last_added = video->first_added;

	} else if (skipped) {
		--frame_info->skipped;
		os_atomic_inc_long(&video->skipped_frames);
	}

	pthread_mutex_unlock(&video->data_mutex);

	/* -------------------------------- */

	return complete;
}
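
/* Consumer side of the frame cache: sleeps on update_semaphore and, on
 * each wake-up, dispatches cached frames to the inputs via
 * video_output_cur_frame(), counting every dispatched frame in
 * total_frames. */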
static void *video_thread(void *param)
{
	struct video_output *video = param;

	os_set_thread_name("video-io: video thread");

	const char *video_thread_name =
		profile_store_name(obs_get_profiler_name_store(),
				   "video_thread(%s)", video->info.name);

	while (os_sem_wait(video->update_semaphore) == 0) {
		if (video->stop)
			break;

		profile_start(video_thread_name);

		while (!video->stop && !video_output_cur_frame(video)) {
			os_atomic_inc_long(&video->total_frames);
		}

		os_atomic_inc_long(&video->total_frames);

		profile_end(video_thread_name);
		profile_reenable_thread();
	}

	return NULL;
}

/* ------------------------------------------------------------------------- */

static inline bool valid_video_params(const struct video_output_info *info)
{
	return info->height != 0 && info->width != 0 && info->fps_den != 0 &&
	       info->fps_num != 0;
}
static inline void init_cache(struct video_output *video)
{
if (video->info.cache_size > MAX_CACHE_SIZE)
video->info.cache_size = MAX_CACHE_SIZE;
for (size_t i = 0; i < video->info.cache_size; i++) {
struct video_frame *frame;
frame = (struct video_frame *)&video->cache[i];
video_frame_init(frame, video->info.format, video->info.width,
video->info.height);
}
video->available_frames = video->info.cache_size;
}
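/* Creates the output, precomputes the per-frame interval in nanoseconds from
 * the FPS fraction, and spins up the video thread. The semaphore starts at
 * zero, so the thread blocks immediately and initializing the cache after
 * pthread_create() is safe. */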
int video_output_open(video_t **video, struct video_output_info *info)
{
struct video_output *out;
if (!valid_video_params(info))
return VIDEO_OUTPUT_INVALIDPARAM;
out = bzalloc(sizeof(struct video_output));
if (!out)
goto fail0;
memcpy(&out->info, info, sizeof(struct video_output_info));
out->frame_time =
util_mul_div64(1000000000ULL, info->fps_den, info->fps_num);
out->initialized = false;
if (pthread_mutex_init_recursive(&out->data_mutex) != 0)
goto fail0;
if (pthread_mutex_init_recursive(&out->input_mutex) != 0)
goto fail1;
if (os_sem_init(&out->update_semaphore, 0) != 0)
goto fail2;
if (pthread_create(&out->thread, NULL, video_thread, out) != 0)
goto fail3;
init_cache(out);
out->initialized = true;
*video = out;
return VIDEO_OUTPUT_SUCCESS;
fail3:
os_sem_destroy(out->update_semaphore);
fail2:
pthread_mutex_destroy(&out->input_mutex);
fail1:
pthread_mutex_destroy(&out->data_mutex);
fail0:
video_output_close(out);
return VIDEO_OUTPUT_FAIL;
}
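/* Usage sketch (illustrative only, not part of this file): a typical raw
 * consumer of this API; "my_receive_frame" and "my_ctx" are hypothetical
 * names.
 *
 *     static void my_receive_frame(void *param, struct video_data *frame)
 *     {
 *         // consume frame->data / frame->linesize / frame->timestamp
 *         (void)param;
 *         (void)frame;
 *     }
 *
 *     struct video_output_info voi = {
 *         .name = "test_video",  .format = VIDEO_FORMAT_NV12,
 *         .width = 1280,         .height = 720,
 *         .fps_num = 30,         .fps_den = 1,
 *         .cache_size = 16,
 *     };
 *     video_t *video;
 *     if (video_output_open(&video, &voi) == VIDEO_OUTPUT_SUCCESS) {
 *         video_output_connect(video, NULL, my_receive_frame, my_ctx);
 *         ...
 *         video_output_disconnect(video, my_receive_frame, my_ctx);
 *         video_output_close(video);
 *     }
 */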
void video_output_close(video_t *video)
{
if (!video)
return;
video_output_stop(video);
for (size_t i = 0; i < video->inputs.num; i++)
video_input_free(&video->inputs.array[i]);
da_free(video->inputs);
for (size_t i = 0; i < video->info.cache_size; i++)
video_frame_free((struct video_frame *)&video->cache[i]);
bfree(video);
}
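/* Looks up a registered input by its (callback, param) pair; the pair acts
 * as the identity of a connection, so the same callback may be attached more
 * than once with distinct params. */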
static size_t video_get_input_idx(const video_t *video,
void (*callback)(void *param,
struct video_data *frame),
void *param)
{
for (size_t i = 0; i < video->inputs.num; i++) {
struct video_input *input = video->inputs.array + i;
if (input->callback == callback && input->param == param)
return i;
}
return DARRAY_INVALID;
}
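/* If the input wants a different size or format than the core video output,
 * create a scaler plus MAX_CONVERT_BUFFERS staging frames for the converted
 * output; otherwise the input consumes cache frames directly. */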
static inline bool video_input_init(struct video_input *input,
struct video_output *video)
{
if (input->conversion.width != video->info.width ||
input->conversion.height != video->info.height ||
input->conversion.format != video->info.format) {
struct video_scale_info from = {.format = video->info.format,
.width = video->info.width,
.height = video->info.height,
.range = video->info.range,
.colorspace =
video->info.colorspace};
int ret = video_scaler_create(&input->scaler,
&input->conversion, &from,
VIDEO_SCALE_FAST_BILINEAR);
if (ret != VIDEO_SCALER_SUCCESS) {
if (ret == VIDEO_SCALER_BAD_CONVERSION)
blog(LOG_ERROR, "video_input_init: Bad "
"scale conversion type");
else
blog(LOG_ERROR, "video_input_init: Failed to "
"create scaler");
return false;
}
for (size_t i = 0; i < MAX_CONVERT_BUFFERS; i++)
video_frame_init(&input->frame[i],
input->conversion.format,
input->conversion.width,
input->conversion.height);
}
return true;
}
static inline void reset_frames(video_t *video)
{
os_atomic_set_long(&video->skipped_frames, 0);
os_atomic_set_long(&video->total_frames, 0);
}
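/* Registers a raw frame consumer. Missing conversion fields fall back to the
 * output's own size/format, and the frame counters are reset when the first
 * raw consumer attaches while no GPU encoders hold a reference. */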
bool video_output_connect(
video_t *video, const struct video_scale_info *conversion,
void (*callback)(void *param, struct video_data *frame), void *param)
{
bool success = false;
if (!video || !callback)
return false;
pthread_mutex_lock(&video->input_mutex);
if (video_get_input_idx(video, callback, param) == DARRAY_INVALID) {
struct video_input input;
memset(&input, 0, sizeof(input));
input.callback = callback;
input.param = param;
if (conversion) {
input.conversion = *conversion;
} else {
input.conversion.format = video->info.format;
input.conversion.width = video->info.width;
input.conversion.height = video->info.height;
}
if (input.conversion.width == 0)
input.conversion.width = video->info.width;
if (input.conversion.height == 0)
input.conversion.height = video->info.height;
success = video_input_init(&input, video);
if (success) {
if (video->inputs.num == 0) {
if (!os_atomic_load_long(&video->gpu_refs)) {
reset_frames(video);
}
os_atomic_set_bool(&video->raw_active, true);
}
da_push_back(video->inputs, &input);
}
}
pthread_mutex_unlock(&video->input_mutex);
return success;
}
static void log_skipped(video_t *video)
{
	long skipped = os_atomic_load_long(&video->skipped_frames);
	long total = os_atomic_load_long(&video->total_frames);

	if (skipped && total)
		blog(LOG_INFO,
		     "Video stopped, number of skipped frames due "
		     "to encoding lag: %ld/%ld (%0.1f%%)",
		     skipped, total,
		     (double)skipped / (double)total * 100.0);
}
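/* Unregisters a consumer; when the last raw consumer detaches and no GPU
 * encoders remain, the skipped-frame statistics are logged. */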
void video_output_disconnect(video_t *video,
void (*callback)(void *param,
struct video_data *frame),
void *param)
{
if (!video || !callback)
return;
pthread_mutex_lock(&video->input_mutex);
size_t idx = video_get_input_idx(video, callback, param);
if (idx != DARRAY_INVALID) {
video_input_free(video->inputs.array + idx);
da_erase(video->inputs, idx);
if (video->inputs.num == 0) {
os_atomic_set_bool(&video->raw_active, false);
if (!os_atomic_load_long(&video->gpu_refs)) {
log_skipped(video);
}
}
}
pthread_mutex_unlock(&video->input_mutex);
}
bool video_output_active(const video_t *video)
{
if (!video)
return false;
return os_atomic_load_bool(&video->raw_active);
}
const struct video_output_info *video_output_get_info(const video_t *video)
{
return video ? &video->info : NULL;
}
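/* Called by the graphics thread to reserve a cache slot. When the cache is
 * full (the encoder is lagging), no new slot is taken; instead the most
 * recently added entry has its duplication count bumped by `count`, which
 * schedules that frame to be encoded again. */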
bool video_output_lock_frame(video_t *video, struct video_frame *frame,
int count, uint64_t timestamp)
{
struct cached_frame_info *cfi;
bool locked;
if (!video)
return false;
pthread_mutex_lock(&video->data_mutex);
if (video->available_frames == 0) {
video->cache[video->last_added].count += count;
video->cache[video->last_added].skipped += count;
locked = false;
} else {
if (video->available_frames != video->info.cache_size) {
if (++video->last_added == video->info.cache_size)
video->last_added = 0;
}
cfi = &video->cache[video->last_added];
cfi->frame.timestamp = timestamp;
cfi->count = count;
cfi->skipped = 0;
memcpy(frame, &cfi->frame, sizeof(*frame));
locked = true;
}
pthread_mutex_unlock(&video->data_mutex);
return locked;
}
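/* Publishes the frame reserved by video_output_lock_frame(): one cache slot
 * is consumed and the video thread is woken via the semaphore. */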
void video_output_unlock_frame(video_t *video)
{
if (!video)
return;
pthread_mutex_lock(&video->data_mutex);
video->available_frames--;
os_sem_post(video->update_semaphore);
pthread_mutex_unlock(&video->data_mutex);
}
uint64_t video_output_get_frame_time(const video_t *video)
{
return video ? video->frame_time : 0;
}
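/* Stops the video thread by raising the stop flag and posting the semaphore
 * so the thread wakes and exits, then joins it and tears down the
 * synchronization primitives. */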
void video_output_stop(video_t *video)
{
void *thread_ret;
if (!video)
return;
if (video->initialized) {
video->initialized = false;
video->stop = true;
os_sem_post(video->update_semaphore);
pthread_join(video->thread, &thread_ret);
os_sem_destroy(video->update_semaphore);
pthread_mutex_destroy(&video->data_mutex);
pthread_mutex_destroy(&video->input_mutex);
}
}
bool video_output_stopped(video_t *video)
{
if (!video)
return true;
return video->stop;
}
enum video_format video_output_get_format(const video_t *video)
{
return video ? video->info.format : VIDEO_FORMAT_NONE;
}
uint32_t video_output_get_width(const video_t *video)
{
return video ? video->info.width : 0;
}
uint32_t video_output_get_height(const video_t *video)
{
return video ? video->info.height : 0;
}
double video_output_get_frame_rate(const video_t *video)
{
if (!video)
return 0.0;
return (double)video->info.fps_num / (double)video->info.fps_den;
}
uint32_t video_output_get_skipped_frames(const video_t *video)
{
	return video ? (uint32_t)os_atomic_load_long(&video->skipped_frames)
		     : 0;
}

uint32_t video_output_get_total_frames(const video_t *video)
{
	return video ? (uint32_t)os_atomic_load_long(&video->total_frames)
		     : 0;
}
/* Note: The four functions below are a slight hack. If the texture encoder
 * thread is active at the same time as the raw encoder thread, the total
 * frame count is simply doubled while both are active. That is fine; what
 * matters more is keeping a relatively accurate skipped frame count. */
void video_output_inc_texture_encoders(video_t *video)
{
if (os_atomic_inc_long(&video->gpu_refs) == 1 &&
!os_atomic_load_bool(&video->raw_active)) {
reset_frames(video);
}
}
void video_output_dec_texture_encoders(video_t *video)
{
if (os_atomic_dec_long(&video->gpu_refs) == 0 &&
!os_atomic_load_bool(&video->raw_active)) {
log_skipped(video);
}
}
void video_output_inc_texture_frames(video_t *video)
{
os_atomic_inc_long(&video->total_frames);
}
void video_output_inc_texture_skipped_frames(video_t *video)
{
os_atomic_inc_long(&video->skipped_frames);
}