IAXClient: codec_theora.c Source File

00001 /*
00002  * iaxclient: a cross-platform IAX softphone library
00003  *
00004  * Copyrights:
00005  * Copyright (C) 2003-2006, Horizon Wimba, Inc.
00006  * Copyright (C) 2007, Wimba, Inc.
00007  *
00008  * Contributors:
00009  * Steve Kann <stevek@stevek.com>
00010  * Mihai Balea <mihai at hates dot ms>
00011  *
00012  * This program is free software, distributed under the terms of
00013  * the GNU Lesser (Library) General Public License.
00014  */
00015 
00016 /*
00017  * Some comments about Theora streaming
00018  * Theora video codec has two problems when it comes to streaming
00019  * and broadcasting video:
00020  *
00021  * - Large headers that need to be passed from the encoder to the decoder
00022  *   to initialize it. The conventional wisdom says we should transfer the
00023  *   headers out of band, but that complicates things with IAX, which does
00024  *   not have a separate signalling channel. Also, it makes things really
00025  *   difficult in a video conference scenario, where video gets switched
00026  *   between participants regularly. To solve this issue, we initialize
00027  *   the encoder and the decoder at the same time, using the headers from
00028  *   the local encoder to initialize the decoder. This works if the
00029  *   endpoints use the exact same version of Theora and the exact same
00030  *   parameters for initialization.
00031  *
00032  * - No support for splitting the frame into multiple slices.  Frames can
00033  *   be relatively large. For a 320x240 video stream, you can see key
00034  *   frames larger than 9KB, which is the maximum UDP packet size on Mac
00035  *   OS X. We split the encoded frame artificially into slices that will
00036  *   fit into a typical MTU.  We also add six bytes at the beginning of
00037  *   each slice.
00038  *
00039  *   - version: right now, first bit should be 0, the rest are undefined
00040  *
00041  *   - source id: a 2-byte random number used to identify stream changes
00042  *     in conference applications; it is transmitted in big-endian format
00043  *     over the wire
00044  *
00045  *   - frame index number - used to detect a new frame when some of the
00046  *     slices of the current frame are missing (only the least significant
00047  *     4 bits are used)
00048  *
00049  *   - index of slice in the frame, starting at 0
00050  *
00051  *   - total number of slices in the frame
00052  *
00053  * Other miscellaneous comments:
00054  *
00055  * - For quality reasons, when we detect a video stream switch, we reject all
00056  *   incoming frames until we receive a key frame.
00057  *
00058  * - Theora only accepts video whose dimensions are multiples of 16. If we combine
00059  *   this with a 4:3 aspect ratio requirement, we get a very limited number
00060  *   of available resolutions. To work around this limitation, we pad the video
00061  *   on encoding, up to the closest multiple of 16. On the decoding side, we
00062  *   remove the padding. This way, video resolution can be any multiple of 2.
00063  *
00064  * We should probably look more into this (how to deal with missing and
00065  * out of order slices)
00066  */
00067 
00068 #include <stdlib.h>
00069 #include "iaxclient_lib.h"
00070 #include "video.h"
00071 #include "codec_theora.h"
00072 #include <theora/theora.h>
00073 
/* Upper bound applied to the per-slice payload size (fragsize), in bytes. */
#define MAX_SLICE_SIZE          8000
/* Size in bytes of the buffer used to reassemble one encoded frame.
 * Parenthesised so the macro expands safely inside larger expressions. */
#define MAX_ENCODED_FRAME_SIZE  (48 * 1024)
00076 
00077 struct theora_decoder
00078 {
00079         theora_state    td;
00080         theora_info     ti;
00081         theora_comment  tc;
00082         unsigned char   frame_index;
00083         unsigned char   slice_count;
00084         int             frame_size;
00085         unsigned short  source_id;
00086         int             got_key_frame;
00087         unsigned char   buffer[MAX_ENCODED_FRAME_SIZE];
00088 };
00089 
00090 struct theora_encoder
00091 {
00092         theora_state    td;
00093         theora_info     ti;
00094         theora_comment  tc;
00095         int             needs_padding;
00096         unsigned char   frame_index;
00097         unsigned short  source_id;
00098         unsigned char   *pad_buffer;
00099 };
00100 
00101 static void destroy( struct iaxc_video_codec *c)
00102 {
00103         struct theora_encoder *e;
00104         struct theora_decoder *d;
00105 
00106         if ( !c )
00107                 return;
00108 
00109         if ( c->encstate )
00110         {
00111                 e = (struct theora_encoder *)c->encstate;
00112                 if ( e->pad_buffer )
00113                         free(e->pad_buffer);
00114                 theora_comment_clear(&e->tc);
00115                 theora_info_clear(&e->ti);
00116                 theora_clear(&e->td);
00117                 free(e);
00118         }
00119         if ( c->decstate )
00120         {
00121                 d = (struct theora_decoder *)c->decstate;
00122                 theora_comment_clear(&d->tc);
00123                 theora_info_clear(&d->ti);
00124                 theora_clear(&d->td);
00125                 free(c->decstate);
00126         }
00127         free(c);
00128 }
00129 
00130 static void reset_decoder_frame_state(struct theora_decoder * d)
00131 {
00132         memset(d->buffer, 0, MAX_ENCODED_FRAME_SIZE);
00133         d->frame_size = 0;
00134         d->slice_count = 0;
00135 }
00136 
00137 static int pass_frame_to_decoder(struct theora_decoder *d, int *outlen, char *out)
00138 {
00139         ogg_packet      op;
00140         yuv_buffer      picture;
00141         unsigned int    line;
00142         int             my_out_len;
00143         int             w, h, ph;
00144 
00145         /* decode into an OP structure */
00146         memset(&op, 0, sizeof(op));
00147         op.bytes = d->frame_size;
00148         op.packet = d->buffer;
00149 
00150         /* reject all incoming frames until we get a key frame */
00151         if ( !d->got_key_frame )
00152         {
00153                 if ( theora_packet_iskeyframe(&op) )
00154                         d->got_key_frame = 1;
00155                 else
00156                         return 1;
00157         }
00158 
00159         if ( theora_decode_packetin(&d->td, &op) == OC_BADPACKET )
00160         {
00161                 fprintf(stderr,
00162                         "codec_theora: warning: theora_decode_packetin says bad packet\n");
00163                 return -1;
00164         }
00165 
00166         w = d->ti.frame_width;
00167         h = d->ti.frame_height;
00168         ph = d->ti.height;
00169 
00170         my_out_len = d->ti.frame_width * d->ti.frame_height * 3 / 2;
00171 
00172         /* make sure we have enough room for the goodies */
00173         if ( *outlen < my_out_len )
00174         {
00175                 fprintf(stderr, "codec_theora: not enough room for decoding\n");
00176                 return -1;
00177         }
00178 
00179         /* finally, here's where we get our goodies */
00180         if ( theora_decode_YUVout(&d->td, &picture) )
00181         {
00182                 fprintf(stderr, "codec_theora: error getting our goodies\n");
00183                 return -1;
00184         }
00185 
00186         //clear output
00187         memset(out, 127, my_out_len);
00188 
00189         for( line = 0 ; line < d->ti.frame_height / 2 ; line++ )
00190         {
00191                 // Y-even
00192                 memcpy(out + picture.y_width * 2 * line,
00193                                 picture.y + 2 * line * picture.y_stride,
00194                                 picture.y_width);
00195                 // Y-odd
00196                 memcpy(out + picture.y_width * (2 * line + 1),
00197                                 picture.y + (2 * line + 1) * picture.y_stride,
00198                                 picture.y_width);
00199                 // U + V
00200                 memcpy(out + (d->ti.frame_width * d->ti.frame_height) +
00201                                 line * d->ti.frame_width / 2,
00202                                 picture.u + line * picture.uv_stride,
00203                                 picture.uv_width);
00204                 memcpy(out + (d->ti.frame_width * d->ti.frame_height * 5 / 4) +
00205                                 line * d->ti.frame_width / 2,
00206                                 picture.v + line * picture.uv_stride,
00207                                 picture.uv_width);
00208         }
00209 
00210         *outlen = my_out_len;
00211 
00212         return 0;
00213 }
00214 
00215 static int decode(struct iaxc_video_codec *c, int inlen, char *in, int *outlen, char *out)
00216 {
00217         struct theora_decoder   *d;
00218         unsigned char           frame_index, slice_index, num_slices, version;
00219         unsigned short          source_id;
00220 
00221         // Sanity checks
00222         if ( !c || !c->decstate || !in || inlen <= 0 || !out || !outlen )
00223                 return -1;
00224 
00225         d = (struct theora_decoder *)c->decstate;
00226 
00227         version = *in++;
00228         source_id = (unsigned short)(*in++) << 8;
00229         source_id |= *in++;
00230         frame_index = *in++ & 0x0f;
00231         slice_index = *in++;
00232         num_slices = *in++;
00233         inlen -= 6;
00234 
00235         if ( version & 0x80 )
00236         {
00237                 fprintf(stderr, "Theora: unknown slice protocol\n");
00238                 return -1;
00239         }
00240 
00241         if ( source_id == d->source_id )
00242         {
00243                 /* We use only the least significant bits to calculate delta
00244                  * this helps with conferencing and video muting/unmuting
00245                  */
00246                 unsigned char frame_delta = (frame_index - d->frame_index) & 0x0f;
00247 
00248                 if ( frame_delta > 8 )
00249                 {
00250                         /* Old slice coming in late, ignore. */
00251                         return 1;
00252                 } else if ( frame_delta > 0 )
00253                 {
00254                         /* Slice belongs to a new frame */
00255                         d->frame_index = frame_index;
00256 
00257                         if ( d->slice_count > 0 )
00258                         {
00259                                 /* Current frame is incomplete, drop it */
00260                                 c->video_stats.dropped_frames++;
00261                                 reset_decoder_frame_state(d);
00262                         }
00263                 }
00264         } else
00265         {
00266                 /* Video stream was switched, the existing frame/slice
00267                  * indexes are meaningless.
00268                  */
00269                 reset_decoder_frame_state(d);
00270                 d->source_id = source_id;
00271                 d->frame_index = frame_index;
00272                 d->got_key_frame = 0;
00273         }
00274 
00275         // Process current slice
00276         if ( c->fragsize * slice_index + inlen > MAX_ENCODED_FRAME_SIZE )
00277         {
00278                 // Frame would be too large, ignore slice
00279                 return -1;
00280         }
00281 
00282         memcpy(d->buffer + c->fragsize * slice_index, in, inlen);
00283         d->slice_count++;
00284 
00285         /* We only know the size of the frame when we get the final slice */
00286         if ( slice_index == num_slices - 1 )
00287                 d->frame_size = c->fragsize * slice_index + inlen;
00288 
00289         if ( d->slice_count < num_slices )
00290         {
00291                 // we're still waiting for some slices
00292                 return 1;
00293         } else
00294         {
00295                 // Frame complete, send to decoder
00296                 int ret = pass_frame_to_decoder(d, outlen, out);
00297 
00298                 // Clean up in preparation for next frame
00299                 reset_decoder_frame_state(d);
00300 
00301                 return ret;
00302         }
00303 }
00304 
/*
 * Copies one w by h image plane from src into the pw by ph plane at dst.
 * Source rows are placed at the start of each destination row; the bytes
 * to the right of every copied row and all rows below the copied area are
 * filled with value.
 *
 *   src    - packed source plane, w bytes per row
 *   w, h   - source width and height
 *   dst    - destination plane, pw bytes per row
 *   pw, ph - destination width and height
 *   value  - fill byte for the padding
 */
static void pad_channel(const char *src, int w, int h, unsigned char *dst,
                int pw, int ph, unsigned char value)
{
        int row;

        if ( w != pw )
        {
                // Rows differ in length: copy row by row, filling each tail
                const char      *from = src;
                unsigned char   *to = dst;

                for ( row = 0 ; row < h ; row++ )
                {
                        memcpy(to, from, w);
                        memset(to + w, value, pw - w);
                        from += w;
                        to += pw;
                }
        } else
        {
                // Same row length: the rows are contiguous, copy in one go
                memcpy(dst, src, w * h);
        }

        // Fill the rows below the copied area, if there are any
        if ( ph > h )
                memset(dst + pw * h, value, (ph - h) * pw);
}
00328 
00329 static int encode(struct iaxc_video_codec *c, int inlen, char *in,
00330                 struct slice_set_t *slice_set)
00331 {
00332         int                     i, size, ssize;
00333         const unsigned char     *p;
00334         struct theora_encoder   *e;
00335         ogg_packet              op;
00336         yuv_buffer              picture;
00337 
00338         // Sanity checks
00339         if ( !c || !c->encstate || !in || !slice_set )
00340                 return -1;
00341 
00342         e = (struct theora_encoder *)c->encstate;
00343 
00344         // Prepare the YUV buffer
00345         if ( e->needs_padding )
00346         {
00347                 // We copy a padded image into the pad buffer and set up the pointers
00348                 // Use pad_channel for each of the YUV channels
00349                 // Use a pad value of 0 for luma and 128 for chroma
00350                 pad_channel(in,
00351                                 e->ti.frame_width,
00352                                 e->ti.frame_height,
00353                                 e->pad_buffer,
00354                                 e->ti.width,
00355                                 e->ti.height,
00356                                 0);
00357 
00358                 pad_channel(in + e->ti.frame_width * e->ti.frame_height,
00359                                 e->ti.frame_width / 2,
00360                                 e->ti.frame_height / 2,
00361                                 e->pad_buffer + e->ti.width * e->ti.height,
00362                                 e->ti.width / 2,
00363                                 e->ti.height / 2,
00364                                 128);
00365 
00366                 pad_channel(in + e->ti.frame_width * e->ti.frame_height * 5 / 4,
00367                                 e->ti.frame_width / 2,
00368                                 e->ti.frame_height / 2,
00369                                 e->pad_buffer + e->ti.width * e->ti.height * 5 / 4,
00370                                 e->ti.width / 2,
00371                                 e->ti.height / 2,
00372                                 128);
00373 
00374                 picture.y = e->pad_buffer;
00375         } else
00376         {
00377                 // use the original buffer
00378                 picture.y = (unsigned char *)in;
00379         }
00380         picture.u = picture.y + e->ti.width * e->ti.height;
00381         picture.v = picture.u + e->ti.width * e->ti.height / 4;
00382         picture.y_width = e->ti.width;
00383         picture.y_height = e->ti.height;
00384         picture.y_stride = e->ti.width;
00385         picture.uv_width = e->ti.width / 2;
00386         picture.uv_height = e->ti.height / 2;
00387         picture.uv_stride = e->ti.width / 2;
00388 
00389         // Send data in for encoding
00390         if ( theora_encode_YUVin(&e->td, &picture) )
00391         {
00392                 fprintf(stderr, "codec_theora: failed theora_encode_YUVin\n");
00393                 return -1;
00394         }
00395 
00396         // Get data from the encoder
00397         if ( theora_encode_packetout(&e->td, 0, &op) != 1 )
00398         {
00399                 fprintf(stderr, "codec_theora: failed theora_encode_packetout\n");
00400                 return -1;
00401         }
00402 
00403         // Check to see if we have a key frame
00404         slice_set->key_frame = theora_packet_iskeyframe(&op) == 1;
00405 
00406         // We need to split the frame into one or more slices
00407         p = op.packet;
00408         size = op.bytes;
00409 
00410         // Figure out how many slices we need
00411         slice_set->num_slices = (size - 1) / c->fragsize + 1;
00412 
00413         // Copy up to fragsize bytes into each slice
00414         for ( i = 0; i < slice_set->num_slices; i++ )
00415         {
00416                 slice_set->data[i][0] = 0;
00417                 slice_set->data[i][1] = (unsigned char)(e->source_id >> 8);
00418                 slice_set->data[i][2] = (unsigned char)(e->source_id & 0xff);
00419                 slice_set->data[i][3] = e->frame_index;
00420                 slice_set->data[i][4] = (unsigned char)i;
00421                 slice_set->data[i][5] = (unsigned char)slice_set->num_slices;
00422                 ssize = (i == slice_set->num_slices - 1) ?
00423                         size % c->fragsize : c->fragsize;
00424                 memcpy(&slice_set->data[i][6], p, ssize);
00425                 slice_set->size[i] = ssize + 6;
00426                 p += ssize;
00427         }
00428         e->frame_index++;
00429 
00430         return 0;
00431 }
00432 
00433 struct iaxc_video_codec *codec_video_theora_new(int format, int w, int h,
00434                 int framerate, int bitrate, int fragsize)
00435 {
00436         struct iaxc_video_codec *c;
00437         struct theora_encoder   *e;
00438         struct theora_decoder   *d;
00439         ogg_packet headerp, commentp, tablep;
00440 
00441         /* Basic sanity checks */
00442         if ( w <= 0 || h <= 0 || framerate <= 0 || bitrate <= 0 || fragsize <= 0 )
00443         {
00444                 fprintf(stderr, "codec_theora: bogus codec params: %d %d %d %d %d\n",
00445                                 w, h, framerate, bitrate, fragsize);
00446                 return NULL;
00447         }
00448 
00449         if ( w % 2 || h % 2 )
00450         {
00451                 fprintf(stderr, "codec_theora: video dimensions must be multiples of 2\n");
00452                 return NULL;
00453         }
00454 
00455         if ( fragsize > MAX_SLICE_SIZE )
00456                 fragsize = MAX_SLICE_SIZE;
00457 
00458         c = (struct iaxc_video_codec *)calloc(sizeof(struct iaxc_video_codec), 1);
00459 
00460         if ( !c )
00461                 goto bail;
00462 
00463         c->decstate = calloc(sizeof(struct theora_decoder), 1);
00464 
00465         if ( !c->decstate )
00466                 goto bail;
00467 
00468         c->encstate = calloc(sizeof(struct theora_encoder), 1);
00469 
00470         if ( !c->encstate )
00471                 goto bail;
00472 
00473         c->format = format;
00474         c->width = w;
00475         c->height = h;
00476         c->framerate = framerate;
00477         c->bitrate = bitrate;
00478         c->fragsize = fragsize;
00479 
00480         c->encode = encode;
00481         c->decode = decode;
00482         c->destroy = destroy;
00483 
00484         e = (struct theora_encoder *)c->encstate;
00485         d = (struct theora_decoder *)c->decstate;
00486 
00487         /* set up some parameters in the contexts */
00488 
00489         theora_info_init(&e->ti);
00490 
00491         /* set up common parameters */
00492         e->ti.frame_width = w;
00493         e->ti.frame_height = h;
00494         e->ti.width = ((w - 1) / 16 + 1) * 16;
00495         e->ti.height = ((h - 1) / 16 + 1) * 16;
00496         e->ti.offset_x = 0;
00497         e->ti.offset_y = 0;
00498 
00499         // We set up a padded frame with dimensions that are multiple of 16
00500         // We allocate a buffer to hold this frame
00501         e->needs_padding = e->ti.width != e->ti.frame_width ||
00502                 e->ti.height != e->ti.frame_height;
00503 
00504         if ( e->needs_padding )
00505         {
00506                 e->pad_buffer = (unsigned char *)
00507                         malloc(e->ti.width * e->ti.height * 3 / 2);
00508 
00509                 if ( !e->pad_buffer )
00510                         goto bail;
00511         }
00512         else
00513         {
00514                 e->pad_buffer = 0;
00515         }
00516 
00517         e->ti.fps_numerator = framerate;
00518         e->ti.fps_denominator = 1;
00519 
00520         e->ti.aspect_numerator = 1;
00521         e->ti.aspect_denominator = 1;
00522 
00523         e->ti.colorspace = OC_CS_UNSPECIFIED;
00524         e->ti.pixelformat = OC_PF_420;
00525 
00526         e->ti.target_bitrate = bitrate;
00527 
00528         e->ti.quality = 0;
00529 
00530         e->ti.dropframes_p = 0;
00531         e->ti.quick_p = 1;
00532         e->ti.keyframe_auto_p = 0;
00533         e->ti.keyframe_frequency = framerate;
00534         e->ti.keyframe_frequency_force = framerate;
00535         e->ti.keyframe_data_target_bitrate = bitrate * 3;
00536         e->ti.keyframe_auto_threshold = 80;
00537         e->ti.keyframe_mindistance = 8;
00538         e->ti.noise_sensitivity = 0;
00539 
00540         if ( theora_encode_init(&e->td, &e->ti) )
00541                 goto bail;
00542 
00543         // Obtain the encoder headers and set up the decoder headers from
00544         // data in the encoder headers
00545         memset(&headerp, 0, sizeof(headerp));
00546         memset(&commentp, 0, sizeof(commentp));
00547         memset(&tablep, 0, sizeof(tablep));
00548 
00549         // Set up the decoder using the encoder headers
00550         theora_info_init(&d->ti);
00551         theora_comment_init(&d->tc);
00552         theora_comment_init(&e->tc);
00553 
00554         if ( theora_encode_header(&e->td, &headerp) )
00555                 goto bail;
00556 
00557         headerp.b_o_s = 1;
00558 
00559         if ( theora_decode_header(&d->ti, &d->tc, &headerp) )
00560                 goto bail;
00561 
00562         if ( theora_encode_comment(&e->tc, &commentp) )
00563                 goto bail;
00564 
00565         if ( theora_decode_header(&d->ti, &d->tc, &commentp) )
00566                 goto bail;
00567 
00568         theora_comment_clear(&e->tc);
00569 
00570         if ( theora_encode_tables(&e->td, &tablep) )
00571                 goto bail;
00572 
00573         if ( theora_decode_header(&d->ti, &d->tc, &tablep) )
00574                 goto bail;
00575 
00576         if ( theora_decode_init(&d->td, &d->ti) )
00577                 goto bail;
00578 
00579         // Generate random source id
00580         srand((unsigned int)time(0));
00581         e->source_id = rand() & 0xffff;
00582 
00583         d->got_key_frame = 0;
00584 
00585         strcpy(c->name, "Theora");
00586         return c;
00587 
00588 bail:
00589         fprintf(stderr, "codec_theora: failed to initialize encoder or decoder\n");
00590 
00591         if ( c )
00592         {
00593                 if ( c->encstate )
00594                         free(c->encstate);
00595 
00596                 if ( c->decstate )
00597                         free(c->decstate);
00598 
00599                 free(c);
00600         }
00601 
00602         return NULL;
00603 }
00604