PCA主要是通过奇异值分解将数据映射到低维度的空间(正交去相关)。PCA在数据降维,数据压缩,特征提取方面有很大贡献。在此,我们利用PCA提取150个主要特征,并将人脸数据全部映射到150维空间,再以这150维人脸特征作为训练数据,训练基于rbf kernel的SVM,模型大约有0.85的准确率。



def fetch_lfw_people(data_home=None, funneled=True, resize=0.5,
                     min_faces_per_person=0, color=False,
                     slice_=(slice(70, 195), slice(78, 172)),
                     download_if_missing=True):
    """Loader for the Labeled Faces in the Wild (LFW) people dataset.

    This dataset is a collection of JPEG pictures of famous people
    collected on the internet; all details are available on the
    official website:

        http://vis-www.cs.umass.edu/lfw/

    Each picture is centered on a single face. Each pixel of each channel
    (color in RGB) is encoded by a float in range 0.0 - 1.0.

    The task is called Face Recognition (or Identification): given the
    picture of a face, find the name of the person given a training set
    (gallery).

    The original images are 250 x 250 pixels, but the default slice and
    resize arguments reduce them to 62 x 47 (height x width).

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By
        default all scikit learn data is stored in '~/scikit_learn_data'
        subfolders.

    funneled : boolean, optional, default: True
        Download and use the funneled variant of the dataset.

    resize : float, optional, default: 0.5
        Ratio used to resize each face picture.

    min_faces_per_person : int, optional, default: 0
        The extracted dataset will only retain pictures of people that
        have at least `min_faces_per_person` different pictures.

    color : boolean, optional, default: False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : optional, default: (slice(70, 195), slice(78, 172))
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid using statistical
        correlation from the background.

    download_if_missing : optional, default: True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.data : numpy array of shape (13233, 2914)
        Each row corresponds to a ravelled face image of original size
        62 x 47 pixels. Changing the ``slice_`` or resize parameters will
        change the shape of the output.

    dataset.images : numpy array of shape (13233, 62, 47)
        Each row is a face image corresponding to one of the 5749 people
        in the dataset. Changing the ``slice_`` or resize parameters will
        change the shape of the output.

    dataset.target : numpy array of shape (13233,)
        Labels associated to each face image. Those labels range from
        0-5748 and correspond to the person IDs.

    dataset.DESCR : string
        Description of the Labeled Faces in the Wild (LFW) dataset.
    """




2、 获取人名信息target_names,以及每个人名对应的人脸数据faces和标签target。首先看一下人脸数据集的结构:


person_names, file_paths = [], []   #人名集合和人脸图片路径# 将每个人名文件下的人脸图片路径保存到paths    for person_name in sorted(listdir(data_folder_path)):        folder_path = join(data_folder_path, person_name)        if not isdir(folder_path):            continue        paths = [join(folder_path, f) for f in listdir(folder_path)]        n_pictures = len(paths)        if n_pictures >= min_faces_per_person:            person_name = person_name.replace('_', ' ')            # 这儿用extend是因为extend可以一次性添加多个元素,append一次只能添加一个            person_names.extend([person_name] * n_pictures)            file_paths.extend(paths)    n_faces = len(file_paths)    if n_faces == 0:        raise ValueError("min_faces_per_person=%d is too restrictive" %                         min_faces_per_person)    # 通过unique函数得到所有不同人名    target_names = np.unique(person_names)    # searchsorted是寻求插入位置的函数,在这儿巧妙的将person_names数字化,target代表person_names中每各名字在target_names的位置    target = np.searchsorted(target_names, person_names)    # 加载人脸,slice_是对人脸切片,lolor用来指定加载彩色还是黑白图片,resize为缩放比例    faces = _load_imgs(file_paths, slice_, color, resize)    # shuffle the faces with a deterministic RNG scheme to avoid having    # all faces of the same person in a row, as it would break some    # cross validation and learning algorithms such as SGD and online    # k-means that make an IID assumption    # 这儿就是打乱人脸顺序的工作了    indices = np.arange(n_faces)    np.random.RandomState(42).shuffle(indices)    faces, target = faces[indices], target[indices]    return faces, target, target_names




# Try to import imread and imresize from PIL. We do this here to prevent    # the whole sklearn.datasets module from depending on PIL.    try:        try:            from scipy.misc import imread        except ImportError:            from scipy.misc.pilutil import imread        from scipy.misc import imresize    except ImportError:        raise ImportError("The Python Imaging Library (PIL)"                          " is required to load data from jpeg files")





