"""
======================================================
Face classification using Haar-like feature descriptor
======================================================
Haar-like feature descriptors were successfully used to implement the first
real-time face detector [1]_. Inspired by this application, we propose an
example illustrating the extraction, selection, and classification of Haar-like
features to detect faces vs. non-faces.
Notes
-----
This example relies on `scikit-learn <https://scikit-learn.org/>`_ for feature
selection and classification.
References
----------
.. [1] Viola, Paul, and Michael J. Jones. "Robust real-time face
detection." International journal of computer vision 57.2
(2004): 137-154.
https://www.merl.com/publications/docs/TR2004-043.pdf
:DOI:`10.1109/CVPR.2001.990517`
"""
from time import time

import numpy as np
import matplotlib.pyplot as plt

from dask import delayed

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from skimage.data import lfw_subset
from skimage.transform import integral_image
from skimage.feature import haar_like_feature
from skimage.feature import haar_like_feature_coord
from skimage.feature import draw_haar_like_feature
###########################################################################
# The procedure to extract the Haar-like features from an image is relatively
# simple. Firstly, a region of interest (ROI) is defined. Secondly, the
# integral image within this ROI is computed. Finally, the integral image is
# used to extract the features.
@delayed
def extract_feature_image(img, feature_type, feature_coord=None):
    """Extract the haar feature for the current image."""
    ii = integral_image(img)
    return haar_like_feature(ii, 0, 0, ii.shape[0], ii.shape[1],
                             feature_type=feature_type,
                             feature_coord=feature_coord)
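
###########################################################################
# As a brief aside, the integral image is what makes these features cheap to
# evaluate: ``ii[r, c]`` holds the sum of all pixels above and to the left of
# ``(r, c)`` inclusive, so any rectangular sum reduces to a few constant-time
# lookups. A minimal, illustrative sanity check (the variable names here are
# ours, not part of the original pipeline):
rng = np.random.default_rng(0)
sample = rng.random((19, 19))
ii_sample = integral_image(sample)
# The value at a corner of the integral image equals the raw pixel sum of
# the box ending there.
assert np.isclose(ii_sample[4, 7], sample[:5, :8].sum())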
###########################################################################
# We use a subset of the CBCL dataset, which is composed of 100 face images
# and 100 non-face images. Each image has been resized to an ROI of 19 by 19
# pixels. We select 75 images from each group to train a classifier and
# determine the most salient features. The remaining 25 images from each
# class are used to assess the performance of the classifier.
images = lfw_subset()
# To speed up the example, extract the two types of features only
feature_types = ['type-2-x', 'type-2-y']
# Build a computation graph using Dask. This allows the use of multiple
# CPU cores later during the actual computation
X = delayed(extract_feature_image(img, feature_types) for img in images)
# Compute the result
t_start = time()
X = np.array(X.compute(scheduler='threads'))
time_full_feature_comp = time() - t_start
# Label images (100 faces and 100 non-faces)
y = np.array([1] * 100 + [0] * 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=150,
                                                    random_state=0,
                                                    stratify=y)
# Extract all possible features
feature_coord, feature_type = \
    haar_like_feature_coord(width=images.shape[2], height=images.shape[1],
                            feature_type=feature_types)
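
###########################################################################
# To make the dimensionality concrete, we can count the candidate features
# directly (illustrative only; ``feature_coord`` holds one entry per
# feature):
print('{} candidate features of types {}'.format(len(feature_coord),
                                                 feature_types))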
###########################################################################
# A random forest classifier can be trained in order to select the most
# salient features, specifically for face classification. The idea is to
# determine which features are most often used by the ensemble of trees.
# By using only the most salient features in subsequent steps, we can
# drastically speed up the computation while retaining accuracy.
# Train a random forest classifier and assess its performance
clf = RandomForestClassifier(n_estimators=1000, max_depth=None,
                             max_features=100, n_jobs=-1, random_state=0)
t_start = time()
clf.fit(X_train, y_train)
time_full_train = time() - t_start
auc_full_features = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
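
###########################################################################
# Scikit-learn normalizes ``feature_importances_`` so that it sums to 1,
# which is what lets us treat its cumulative sum below as a fraction of the
# total importance. A quick, illustrative check:
print('Sum of feature importances: {:.3f}'.format(
    clf.feature_importances_.sum()))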
# Sort features in order of importance and plot the six most significant
idx_sorted = np.argsort(clf.feature_importances_)[::-1]
fig, axes = plt.subplots(3, 2)
for idx, ax in enumerate(axes.ravel()):
    image = images[0]
    image = draw_haar_like_feature(image, 0, 0,
                                   images.shape[2],
                                   images.shape[1],
                                   [feature_coord[idx_sorted[idx]]])
    ax.imshow(image)
    ax.set_xticks([])
    ax.set_yticks([])

_ = fig.suptitle('The most important features')
###########################################################################
# We can select the most important features by checking the cumulative sum
# of the feature importance. In this example, we keep the features
# representing 70% of the cumulative value (which corresponds to using only 3%
# of the total number of features).
cdf_feature_importances = np.cumsum(clf.feature_importances_[idx_sorted])
cdf_feature_importances /= cdf_feature_importances[-1] # divide by max value
sig_feature_count = np.count_nonzero(cdf_feature_importances < 0.7)
sig_feature_percent = round(sig_feature_count /
                            len(cdf_feature_importances) * 100, 1)
print(('{} features, or {}%, account for 70% of branch points in the '
       'random forest.').format(sig_feature_count, sig_feature_percent))
# Select the determined number of most informative features
feature_coord_sel = feature_coord[idx_sorted[:sig_feature_count]]
feature_type_sel = feature_type[idx_sorted[:sig_feature_count]]
# Note: it is also possible to select the features directly from the matrix X
# (see the one-line sketch below), but we would like to emphasize the usage of
# `feature_coord` and `feature_type` to recompute a subset of desired features.
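# Illustrative only: the direct selection mentioned above is a single slice
# of the full feature matrix computed earlier (`X_direct_sel` is our name,
# not part of the original example).
X_direct_sel = X[:, idx_sorted[:sig_feature_count]]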
# Build the computational graph using Dask
X = delayed(extract_feature_image(img, feature_type_sel, feature_coord_sel)
            for img in images)
# Compute the result
t_start = time()
X = np.array(X.compute(scheduler='threads'))
time_subs_feature_comp = time() - t_start
y = np.array([1] * 100 + [0] * 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=150,
                                                    random_state=0,
                                                    stratify=y)
###########################################################################
# Once the features are extracted, we can train and test a new classifier.
t_start = time()
clf.fit(X_train, y_train)
time_subs_train = time() - t_start
auc_subs_features = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
summary = (('Computing the full feature set took {:.3f}s, plus {:.3f}s '
            'training, for an AUC of {:.2f}. Computing the restricted '
            'feature set took {:.3f}s, plus {:.3f}s training, '
            'for an AUC of {:.2f}.')
           .format(time_full_feature_comp, time_full_train,
                   auc_full_features, time_subs_feature_comp,
                   time_subs_train, auc_subs_features))
print(summary)
plt.show()