OpenCV  4.5.2
Open Source Computer Vision
AKAZE and ORB planar tracking

Prev Tutorial: AKAZE local features matching
Next Tutorial: Basic concepts of the homography explained with code

Original author Fedor Morozov
Compatibility OpenCV >= 3.0

Introduction

In this tutorial we will compare AKAZE and ORB local features using them to find matches between video frames and track object movements.

The algorithm is as follows:

  • Detect and describe keypoints on the first frame, manually set object boundaries
  • For every next frame:
    1. Detect and describe keypoints
    2. Match them using bruteforce matcher
    3. Estimate homography transformation using RANSAC
    4. Filter inliers from all the matches
    5. Apply homography transformation to the bounding box to find the object
    6. Draw bounding box and inliers, compute inlier ratio as evaluation metric

Data

To do the tracking we need a video and object position on the first frame.

You can download our example video and data from here.

To run the code you have to specify input (camera id or video_file). Then, select a bounding box with the mouse, and press any key to start tracking

./planar_tracking blais.mp4

Source Code

#include <opencv2/highgui.hpp> //for imshow
#include <vector>
#include <iostream>
#include <iomanip>
#include "stats.h" // Stats structure definition
#include "utils.h" // Drawing and printing functions
using namespace std;
using namespace cv;
const double akaze_thresh = 3e-4; // AKAZE detection threshold set to locate about 1000 keypoints
const double ransac_thresh = 2.5f; // RANSAC inlier threshold
const double nn_match_ratio = 0.8f; // Nearest-neighbour matching ratio
const int bb_min_inliers = 100; // Minimal number of inliers to draw bounding box
const int stats_update_period = 10; // On-screen statistics are updated every 10 frames
namespace example {
class Tracker
{
public:
detector(_detector),
matcher(_matcher)
{}
void setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats);
Mat process(const Mat frame, Stats& stats);
Ptr<Feature2D> getDetector() {
return detector;
}
protected:
Ptr<Feature2D> detector;
Mat first_frame, first_desc;
vector<KeyPoint> first_kp;
vector<Point2f> object_bb;
};
void Tracker::setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats)
{
cv::Point *ptMask = new cv::Point[bb.size()];
const Point* ptContain = { &ptMask[0] };
int iSize = static_cast<int>(bb.size());
for (size_t i=0; i<bb.size(); i++) {
ptMask[i].x = static_cast<int>(bb[i].x);
ptMask[i].y = static_cast<int>(bb[i].y);
}
first_frame = frame.clone();
cv::Mat matMask = cv::Mat::zeros(frame.size(), CV_8UC1);
cv::fillPoly(matMask, &ptContain, &iSize, 1, cv::Scalar::all(255));
detector->detectAndCompute(first_frame, matMask, first_kp, first_desc);
stats.keypoints = (int)first_kp.size();
drawBoundingBox(first_frame, bb);
putText(first_frame, title, Point(0, 60), FONT_HERSHEY_PLAIN, 5, Scalar::all(0), 4);
object_bb = bb;
delete[] ptMask;
}
Mat Tracker::process(const Mat frame, Stats& stats)
{
vector<KeyPoint> kp;
Mat desc;
tm.start();
detector->detectAndCompute(frame, noArray(), kp, desc);
stats.keypoints = (int)kp.size();
vector< vector<DMatch> > matches;
vector<KeyPoint> matched1, matched2;
matcher->knnMatch(first_desc, desc, matches, 2);
for(unsigned i = 0; i < matches.size(); i++) {
if(matches[i][0].distance < nn_match_ratio * matches[i][1].distance) {
matched1.push_back(first_kp[matches[i][0].queryIdx]);
matched2.push_back( kp[matches[i][0].trainIdx]);
}
}
stats.matches = (int)matched1.size();
Mat inlier_mask, homography;
vector<KeyPoint> inliers1, inliers2;
vector<DMatch> inlier_matches;
if(matched1.size() >= 4) {
homography = findHomography(Points(matched1), Points(matched2),
RANSAC, ransac_thresh, inlier_mask);
}
tm.stop();
stats.fps = 1. / tm.getTimeSec();
if(matched1.size() < 4 || homography.empty()) {
Mat res;
hconcat(first_frame, frame, res);
stats.inliers = 0;
stats.ratio = 0;
return res;
}
for(unsigned i = 0; i < matched1.size(); i++) {
if(inlier_mask.at<uchar>(i)) {
int new_i = static_cast<int>(inliers1.size());
inliers1.push_back(matched1[i]);
inliers2.push_back(matched2[i]);
inlier_matches.push_back(DMatch(new_i, new_i, 0));
}
}
stats.inliers = (int)inliers1.size();
stats.ratio = stats.inliers * 1.0 / stats.matches;
vector<Point2f> new_bb;
perspectiveTransform(object_bb, new_bb, homography);
Mat frame_with_bb = frame.clone();
if(stats.inliers >= bb_min_inliers) {
drawBoundingBox(frame_with_bb, new_bb);
}
Mat res;
drawMatches(first_frame, inliers1, frame_with_bb, inliers2,
inlier_matches, res,
Scalar(255, 0, 0), Scalar(255, 0, 0));
return res;
}
}
int main(int argc, char **argv)
{
CommandLineParser parser(argc, argv, "{@input_path |0|input path can be a camera id, like 0,1,2 or a video filename}");
parser.printMessage();
string input_path = parser.get<string>(0);
string video_name = input_path;
VideoCapture video_in;
if ( ( isdigit(input_path[0]) && input_path.size() == 1 ) )
{
int camera_no = input_path[0] - '0';
video_in.open( camera_no );
}
else {
video_in.open(video_name);
}
if(!video_in.isOpened()) {
cerr << "Couldn't open " << video_name << endl;
return 1;
}
Stats stats, akaze_stats, orb_stats;
Ptr<AKAZE> akaze = AKAZE::create();
akaze->setThreshold(akaze_thresh);
Ptr<ORB> orb = ORB::create();
Ptr<DescriptorMatcher> matcher = DescriptorMatcher::create("BruteForce-Hamming");
example::Tracker akaze_tracker(akaze, matcher);
example::Tracker orb_tracker(orb, matcher);
Mat frame;
namedWindow(video_name, WINDOW_NORMAL);
cout << "\nPress any key to stop the video and select a bounding box" << endl;
while ( waitKey(1) < 1 )
{
video_in >> frame;
cv::resizeWindow(video_name, frame.size());
imshow(video_name, frame);
}
vector<Point2f> bb;
cv::Rect uBox = cv::selectROI(video_name, frame);
bb.push_back(cv::Point2f(static_cast<float>(uBox.x), static_cast<float>(uBox.y)));
bb.push_back(cv::Point2f(static_cast<float>(uBox.x+uBox.width), static_cast<float>(uBox.y)));
bb.push_back(cv::Point2f(static_cast<float>(uBox.x+uBox.width), static_cast<float>(uBox.y+uBox.height)));
bb.push_back(cv::Point2f(static_cast<float>(uBox.x), static_cast<float>(uBox.y+uBox.height)));
akaze_tracker.setFirstFrame(frame, bb, "AKAZE", stats);
orb_tracker.setFirstFrame(frame, bb, "ORB", stats);
Stats akaze_draw_stats, orb_draw_stats;
Mat akaze_res, orb_res, res_frame;
int i = 0;
for(;;) {
i++;
bool update_stats = (i % stats_update_period == 0);
video_in >> frame;
// stop the program if no more images
if(frame.empty()) break;
akaze_res = akaze_tracker.process(frame, stats);
akaze_stats += stats;
if(update_stats) {
akaze_draw_stats = stats;
}
orb->setMaxFeatures(stats.keypoints);
orb_res = orb_tracker.process(frame, stats);
orb_stats += stats;
if(update_stats) {
orb_draw_stats = stats;
}
drawStatistics(akaze_res, akaze_draw_stats);
drawStatistics(orb_res, orb_draw_stats);
vconcat(akaze_res, orb_res, res_frame);
cv::imshow(video_name, res_frame);
if(waitKey(1)==27) break; //quit on ESC button
}
akaze_stats /= i - 1;
orb_stats /= i - 1;
printStatistics("AKAZE", akaze_stats);
printStatistics("ORB", orb_stats);
return 0;
}

Explanation

Tracker class

This class implements algorithm described abobve using given feature detector and descriptor matcher.

  • Setting up the first frame

    void Tracker::setFirstFrame(const Mat frame, vector<Point2f> bb, string title, Stats& stats)
    {
    first_frame = frame.clone();
    (*detector)(first_frame, noArray(), first_kp, first_desc);
    stats.keypoints = (int)first_kp.size();
    drawBoundingBox(first_frame, bb);
    putText(first_frame, title, Point(0, 60), FONT_HERSHEY_PLAIN, 5, Scalar::all(0), 4);
    object_bb = bb;
    }

    We compute and store keypoints and descriptors from the first frame and prepare it for the output.

    We need to save number of detected keypoints to make sure both detectors locate roughly the same number of those.

  • Processing frames
    1. Locate keypoints and compute descriptors

      (*detector)(frame, noArray(), kp, desc);

      To find matches between frames we have to locate the keypoints first.

      In this tutorial detectors are set up to find about 1000 keypoints on each frame.

    2. Use 2-nn matcher to find correspondences
      matcher->knnMatch(first_desc, desc, matches, 2);
      for(unsigned i = 0; i < matches.size(); i++) {
      if(matches[i][0].distance < nn_match_ratio * matches[i][1].distance) {
      matched1.push_back(first_kp[matches[i][0].queryIdx]);
      matched2.push_back( kp[matches[i][0].trainIdx]);
      }
      }
      If the closest match is nn_match_ratio closer than the second closest one, then it's a match.
    3. Use RANSAC to estimate homography transformation
      homography = findHomography(Points(matched1), Points(matched2),
      RANSAC, ransac_thresh, inlier_mask);
      If there are at least 4 matches we can use random sample consensus to estimate image transformation.
    4. Save the inliers
      for(unsigned i = 0; i < matched1.size(); i++) {
      if(inlier_mask.at<uchar>(i)) {
      int new_i = static_cast<int>(inliers1.size());
      inliers1.push_back(matched1[i]);
      inliers2.push_back(matched2[i]);
      inlier_matches.push_back(DMatch(new_i, new_i, 0));
      }
      }
      Since findHomography computes the inliers we only have to save the chosen points and matches.
    5. Project object bounding box

      perspectiveTransform(object_bb, new_bb, homography);

      If there is a reasonable number of inliers we can use estimated transformation to locate the object.

Results

You can watch the resulting video on youtube.

AKAZE statistics:

Matches 626
Inliers 410
Inlier ratio 0.58
Keypoints 1117

ORB statistics:

Matches 504
Inliers 319
Inlier ratio 0.56
Keypoints 1112
cv::noArray
InputOutputArray noArray()
calib3d.hpp
cv::Point_< int >
cv::Mat::clone
Mat clone() const CV_NODISCARD
Creates a full copy of the array and the underlying data.
cv::Scalar_< double >::all
static Scalar_< double > all(double v0)
returns a scalar with all elements set to v0
cv::TickMeter::stop
void stop()
stops counting ticks.
Definition: utility.hpp:310
cv::Mat::zeros
static MatExpr zeros(int rows, int cols, int type)
Returns a zero array of the specified size and type.
cv::Mat::at
_Tp & at(int i0=0)
Returns a reference to the specified array element.
cv::Tracker
Base abstract class for the long-term tracker.
Definition: tracking.hpp:713
cv::ORB::setMaxFeatures
virtual void setMaxFeatures(int maxFeatures)=0
cv::VideoCapture
Class for video capturing from video files, image sequences or cameras.
Definition: videoio.hpp:664
cv::waitKey
int waitKey(int delay=0)
Waits for a pressed key.
cv::Feature2D::detectAndCompute
virtual void detectAndCompute(InputArray image, InputArray mask, std::vector< KeyPoint > &keypoints, OutputArray descriptors, bool useProvidedKeypoints=false)
cv::Point_::y
_Tp y
y coordinate of the point
Definition: types.hpp:187
cv::DMatch
Class for matching keypoint descriptors.
Definition: types.hpp:808
cv::Point_::x
_Tp x
x coordinate of the point
Definition: types.hpp:186
cv::hconcat
void hconcat(const Mat *src, size_t nsrc, OutputArray dst)
Applies horizontal concatenation to given matrices.
highgui.hpp
cv::namedWindow
void namedWindow(const String &winname, int flags=WINDOW_AUTOSIZE)
Creates a window.
cv::findHomography
Mat findHomography(InputArray srcPoints, InputArray dstPoints, int method=0, double ransacReprojThreshold=3, OutputArray mask=noArray(), const int maxIters=2000, const double confidence=0.995)
Finds a perspective transformation between two planes.
cv::RANSAC
@ RANSAC
RANSAC algorithm.
Definition: calib3d.hpp:445
cv::resizeWindow
void resizeWindow(const String &winname, int width, int height)
Resizes the window to the specified size.
cv::Mat::empty
bool empty() const
Returns true if the array has no elements.
cv::TickMeter::start
void start()
starts counting ticks.
Definition: utility.hpp:304
cv::FONT_HERSHEY_PLAIN
@ FONT_HERSHEY_PLAIN
small size sans-serif font
Definition: imgproc.hpp:824
cv::TickMeter::getTimeSec
double getTimeSec() const
returns passed time in seconds.
Definition: utility.hpp:339
cv::Mat::size
MatSize size
Definition: mat.hpp:2118
cv::Rect_::y
_Tp y
y coordinate of the top-left corner
Definition: types.hpp:454
cv::VideoCapture::isOpened
virtual bool isOpened() const
Returns true if video capturing has been initialized already.
cv::selectROI
Rect selectROI(const String &windowName, InputArray img, bool showCrosshair=true, bool fromCenter=false)
Allows users to select a ROI on the given image.
cv::TickMeter
a Class to measure passing time.
Definition: utility.hpp:294
cv::Ptr
std::shared_ptr< _Tp > Ptr
Definition: cvstd_wrapper.hpp:23
cv::Rect_
Template class for 2D rectangles.
Definition: types.hpp:420
uchar
unsigned char uchar
Definition: interface.h:51
cv::imshow
void imshow(const String &winname, InputArray mat)
Displays an image in the specified window.
cv::Rect_::width
_Tp width
width of the rectangle
Definition: types.hpp:455
features2d.hpp
cv::Scalar
Scalar_< double > Scalar
Definition: types.hpp:669
cv::drawMatches
void drawMatches(InputArray img1, const std::vector< KeyPoint > &keypoints1, InputArray img2, const std::vector< KeyPoint > &keypoints2, const std::vector< DMatch > &matches1to2, InputOutputArray outImg, const Scalar &matchColor=Scalar::all(-1), const Scalar &singlePointColor=Scalar::all(-1), const std::vector< char > &matchesMask=std::vector< char >(), DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT)
Draws the found matches of keypoints from two images.
cv::Rect_::height
_Tp height
height of the rectangle
Definition: types.hpp:456
cv::putText
void putText(InputOutputArray img, const String &text, Point org, int fontFace, double fontScale, Scalar color, int thickness=1, int lineType=LINE_8, bool bottomLeftOrigin=false)
Draws a text string.
cv::Point
Point2i Point
Definition: types.hpp:194
cv::vconcat
void vconcat(const Mat *src, size_t nsrc, OutputArray dst)
Applies vertical concatenation to given matrices.
cv::Mat
n-dimensional dense array class
Definition: mat.hpp:801
cv::AKAZE::setThreshold
virtual void setThreshold(double threshold)=0
cv::perspectiveTransform
void perspectiveTransform(InputArray src, OutputArray dst, InputArray m)
Performs the perspective matrix transformation of vectors.
cv::CommandLineParser
Designed for command line parsing.
Definition: utility.hpp:799
CV_8UC1
#define CV_8UC1
Definition: interface.h:88
cv
"black box" representation of the file storage associated with a file on disk.
Definition: affine.hpp:51
imgproc.hpp
cv::fillPoly
void fillPoly(InputOutputArray img, const Point **pts, const int *npts, int ncontours, const Scalar &color, int lineType=LINE_8, int shift=0, Point offset=Point())
cv::WINDOW_NORMAL
@ WINDOW_NORMAL
the user can resize the window (no constraint) / also use to switch a fullscreen window to a normal s...
Definition: highgui.hpp:187
videoio.hpp
cv::Rect_::x
_Tp x
x coordinate of the top-left corner
Definition: types.hpp:453
cv::VideoCapture::open
virtual bool open(const String &filename, int apiPreference=CAP_ANY)
Opens a video file or a capturing device or an IP video stream for video capturing.