Image Classification

resnet_v1d_50

##-*- encoding:utf-8 -*-
# Author: wenmeng.zwm@alibaba-inc.com
# Date: 2018-09-06
# classification_resnet50.config: resnet50 classification model configuration
#

model_config {
  model_class: 'Classification'
  classification {
    backbone {
      class_name: 'resnet_v1d_50'
      weight_decay: 0.0001
    } 
    num_classes: 1000
    loss {
      weighted_softmax {
      }
    }
    label_id_offset: 1
  }
}

train_config: {
  optimizer {
    momentum_optimizer: {
       learning_rate: {
         manual_step_learning_rate {
           initial_learning_rate: 0.0
           schedule {
             step: 5000
             learning_rate: 0.1
           }
           schedule {
             step: 200000 
             learning_rate: 0.01
           }
           schedule {
             step: 400000 
             learning_rate: 0.001
           }
            schedule {
             step: 600000
             learning_rate: 0.0001
           }
           warmup: true
         }
       }
       momentum_optimizer_value: 0.9
    }
  }

  #gradient_clipping_by_norm : 10.0

  #distribute training setting
  sync_replicas: true
  #using 8 gpu
  replicas_to_aggregate:8
  num_worker_replicas: 8 

  num_steps: 800000 
  model_dir: 'experiments/imagenet_resnet50_dis/train'
}

train_data: {
  input_path: "data/imagenet_tfrecord/train-*"
  batch_size: 32 
  num_readers: 4
  shuffle: true
  read_block_length: 32
  classification_decoder_config{
  }
  data_augmentation_options {
    random_distort_color {
      color_ordering: 0
      fast_mode: true
    }
  }
  data_augmentation_options {
    vgg_preprocessing {
      is_training: true
    }
  }
}

eval_config: {
  num_examples: 50000 
  # Note: The below line limits the evaluation process to 10 evaluations.
  # Remove the below line to evaluate indefinitely.
  # max_evals: 10
  metrics_set :  'classification_metric'
}

eval_data : {
  input_path: "data/imagenet_tfrecord/validation-*"
  batch_size: 32 
  shuffle: false
  num_readers: 1
  classification_decoder_config{
    label_map_path: 'data/imagenet_tfrecord/imagenet_labelmap.pbtxt'
  }

  data_augmentation_options {
    vgg_preprocessing {
      is_training: false 
    }
  }
}
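
The schedule above is piecewise constant: the learning rate becomes 0.1 at step 5000 and then drops at 200k, 400k and 600k steps. With warmup: true, the rate is presumably ramped linearly from initial_learning_rate up to the first scheduled rate over the first 5000 steps (this warmup semantics is an assumption, mirroring the TF Object Detection API convention this format resembles). A minimal Python sketch of that behavior:

def manual_step_lr(step,
                   initial_lr=0.0,
                   boundaries=(5000, 200000, 400000, 600000),
                   rates=(0.1, 0.01, 0.001, 0.0001),
                   warmup=True):
    """Piecewise-constant LR with linear warmup to the first rate (assumed semantics)."""
    if warmup and step < boundaries[0]:
        # linear ramp from initial_lr to rates[0] over the first boundary
        return initial_lr + (rates[0] - initial_lr) * step / boundaries[0]
    lr = initial_lr
    for boundary, rate in zip(boundaries, rates):
        if step >= boundary:
            lr = rate
    return lr

for s in (0, 2500, 5000, 300000, 700000):
    print(s, manual_step_lr(s))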

mobilenet_v3

##-*- encoding:utf-8 -*-
# Author: wenmeng.zwm@alibaba-inc.com
# Date: 2018-09-06
# mobilenet_v3 classification model configuration
#

model_config {
  model_class: 'Classification'
  classification {
    backbone {
      class_name: 'mobilenet_v3'
      weight_decay: 0.00004
      depth_multiplier: 1.0
    } 
    num_classes: 1001
    loss {
      weighted_softmax {
      }
    }
    label_id_offset: 0
  }
}

train_config: {
  optimizer {
    momentum_optimizer: {
       learning_rate: {
         manual_step_learning_rate {
           initial_learning_rate: 0.1
           schedule {
             step: 200000 
             learning_rate: 0.01
           }
           schedule {
             step: 400000 
             learning_rate: 0.001
           }
            schedule {
             step: 550000 
             learning_rate: 0.0001
           }
          }
       }
       momentum_optimizer_value: 0.9
    }
  }


  #gradient_clipping_by_norm : 10.0
  num_steps: 800000 
  model_dir: 'experiments/imagenet/output/imagenet_mobilenet_v3'
}

train_data: {
  input_path: "data/imagenet/tfrecords/train-*"
  batch_size: 256 
  num_readers: 4
  shuffle: true
  read_block_length : 256
  classification_decoder_config{
  }

  data_augmentation_options {
    inception_preprocessing {
      is_training: true
    }
  }
}

eval_config: {
  num_examples: 50000 
  # Note: The below line limits the evaluation process to 10 evaluations.
  # Remove the below line to evaluate indefinitely.
  # max_evals: 10
  num_visualizations: 0
  metrics_set :  'classification_metric'
}

eval_data : {
  input_path: "data/imagenet/tfrecords/validation-*"
  batch_size: 100
  shuffle: false
  num_readers: 1
  classification_decoder_config{
    label_map_path: 'data/imagenet/tfrecords/imagenet_labelmap.pbtxt'
  }

  data_augmentation_options {
    inception_preprocessing {
      is_training: false
    }
  }
}
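
Note the labeling convention differs from the resnet_v1d_50 config above: 1001 classes with label_id_offset 0 here, versus 1000 classes with label_id_offset 1 there. A plausible reading, and only an assumption matching the common slim/Inception convention, is that the 1001-way head keeps a background slot at index 0, while the offset of 1 maps 1-based ImageNet labels onto a plain 1000-way head:

# Hypothetical illustration of the two label conventions; not framework code.
raw_label = 5                       # 1-based ImageNet label stored in the tfrecord

# mobilenet_v3 config: num_classes 1001, label_id_offset 0 -> logits[0] is background
logit_index_1001 = raw_label - 0    # 5

# resnet_v1d_50 config: num_classes 1000, label_id_offset 1 -> background slot removed
logit_index_1000 = raw_label - 1    # 4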

efficientnet_b0

##-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2020-02-10

model_config {
  model_class: 'Classification'
  classification {
    backbone {
      class_name: 'efficientnet-b0'
      weight_decay: 0.00001
      connect_survival_prob: 0.8
    } 
    num_classes: 1000
    loss {
      weighted_softmax {
      }
    }
    label_id_offset: 1
  }
}

train_config: {
  optimizer {
    rms_prop_optimizer: {
       learning_rate: {
         # decay by 0.97 every 2.4 epochs
         exponential_decay_learning_rate {
           initial_learning_rate: 0.032
           decay_steps: 6000
           decay_factor: 0.97
          }
       }
    }
    use_moving_average: false
  }
  num_steps: 900000     # total 150 epochs
  save_checkpoints_steps: 5000
  model_dir: 'experiments/imagenet/output/imagenet_efficientnet_b0/'
  sync_replicas: true
  replicas_to_aggregate:8
  num_worker_replicas: 8
}

train_data: {
  input_path: "data/imagenet/tfrecords/train-*"
  batch_size: 64 
  num_readers: 8
  shuffle: true
  read_block_length : 32 
  classification_decoder_config{
  }

  data_augmentation_options {
    efficientnet_preprocessing {
      model_name: 'efficientnet-b0' # use default image size for the model
      is_training: true

    }
  }
}

eval_config: {
  num_examples: 50000
  metrics_set :  'classification_metric'
}

eval_data : {
  input_path: "data/imagenet/tfrecords/validation-*"
  batch_size: 100
  read_block_length : 100
  shuffle: false
  num_readers: 1
  classification_decoder_config{
    label_map_path: 'data/imagenet/imagenet_labelmap.pbtxt'
  }

  data_augmentation_options {
    efficientnet_preprocessing {
      model_name: 'efficientnet-b0' # use default image size for the model
      is_training: false
    }
  }
}
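
The comment "decay by 0.97 every 2.4 epochs" is consistent with the distributed setup above: with batch_size 64 aggregated over 8 replicas (512 images per step) and roughly 1.28M ImageNet training images, one epoch is about 2500 steps, so 2.4 epochs is about 6000 steps, i.e. decay_steps. A short sketch, assuming the standard continuous exponential decay lr = initial * factor ** (step / decay_steps):

# Assumed standard exponential decay; a staircase variant would floor the exponent.
IMAGES = 1_281_167                   # ImageNet-1k training images
BATCH = 64 * 8                       # batch_size x replicas_to_aggregate
steps_per_epoch = IMAGES / BATCH     # ~2500
print(round(2.4 * steps_per_epoch))  # ~6000 -> decay_steps

def lr(step, initial=0.032, factor=0.97, decay_steps=6000):
    return initial * factor ** (step / decay_steps)

for s in (0, 6000, 60000, 900000):
    print(s, round(lr(s), 6))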

darknet53

##-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2020-02-10

model_config {
  model_class: 'Classification'
  classification {
    backbone {
      class_name: 'darknet53'
      weight_decay: 0.00001
    } 
    num_classes: 1000
    loss {
      weighted_softmax {
      }
    }
    label_id_offset: 1
  }
}

train_config: {
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        # decay by 0.97 every 2.4 epochs
        exponential_decay_learning_rate {
          initial_learning_rate: 0.032
          decay_steps: 6000
          decay_factor: 0.97
         }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  num_steps: 900000     # total 150 epochs
  save_checkpoints_steps: 5000
  model_dir: 'experiments/imagenet/output/imagenet_darknet53/'
}

train_data: {
  input_path: "data/imagenet/tfrecords/train-*"
  batch_size: 64 
  num_readers: 8
  shuffle: true
  read_block_length : 32 
  classification_decoder_config{
  }

  data_augmentation_options {
    classification_random_crop {
    }
  }

  data_augmentation_options {
    resize_image {
      new_height: 256
      new_width: 256
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0.0
      original_maxval: 255.0
      target_minval: 0.0
      target_maxval: 1.0
    }
  }
}

eval_config: {
  num_examples: 50000
  metrics_set :  'classification_metric'
}

eval_data : {
  input_path: "data/imagenet/tfrecords/validation-*"
  batch_size: 100
  read_block_length : 100
  shuffle: false
  num_readers: 1
  classification_decoder_config{
    label_map_path: 'data/imagenet/imagenet_labelmap.pbtxt'
  }

  data_augmentation_options {
    classification_central_crop {
    }
  }

  data_augmentation_options {
    resize_image {
      new_height: 224 #256
      new_width: 224 #256
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0.0
      original_maxval: 255.0
      target_minval: 0.0
      target_maxval: 1.0
    }
  }
}
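
The normalize_image step above is a plain linear rescale of pixel values from the original [0, 255] range into the target [0, 1] range. A short sketch of the assumed mapping of the four fields:

def normalize(x, original_minval=0.0, original_maxval=255.0,
              target_minval=0.0, target_maxval=1.0):
    # map [original_minval, original_maxval] linearly onto [target_minval, target_maxval]
    scale = (target_maxval - target_minval) / (original_maxval - original_minval)
    return (x - original_minval) * scale + target_minval

print(normalize(255.0))  # 1.0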

Object Detection

faster_rcnn_r50

#-*- encoding:utf-8 -*-
# Author: mengli.cml@alibaba-inc.com
# Date: 2018-06-22
# simple_rpn.config: encode the configs used in a simple rpn model

train_config: {
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.00001
          schedule {
            step: 100
            learning_rate: 0.001
          }
          schedule {
            step: 90000
            learning_rate: .0001
          }
          schedule {
            step: 120000
            learning_rate: .00001
          }
          warmup: true
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  fine_tune_checkpoint: "pretrained_models/resnet_v1d_50/model.ckpt"
  num_steps: 150000
  model_dir: "pascal_resnet50_frcnn_model"
}

train_data: {
  # [0-7] evenly split into 8 parts
  input_path: "data/voc0712_tfrecord/voc0712_part_*.tfrecord"
  batch_size: 2
  num_readers: 4
  read_block_length: 1
  shuffle: true
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    random_horizontal_flip {
    }
  }

  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 600
      max_sizes: 1024
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: false
}

eval_config: {
  num_examples: 4952
  max_evals: 1000
  num_visualizations: 100
  #metrics_set: 'coco_detection_metrics'
  metrics_set: 'pascal_voc_detection_metrics'
  metrics_set: 'pascal_voc07_detection_metrics'
}

eval_data: {
  input_path: "data/voc0712_tfrecord/VOC2007_test.tfrecord"
  batch_size: 1
  shuffle: false
  num_readers: 1
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 600
      max_sizes: 1024
    }
  }
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: true
}

export_config {
  batch_size: 1
}

model_config: {
  model_class: 'FasterRcnn'
  faster_rcnn {
    backbone {
      class_name: 'resnet_v1d_50'
      batchnorm_trainable: false
      weight_decay: 0.0001
      output_stride: 16
    }
    rpn_head {
      input_layer: 'resnet_v1d_50/block3'
      box_predictor {
        convolutional_box_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              truncated_normal_initializer {
                stddev: 0.01
              }
            }
          }
          min_depth: 512
          max_depth: 512
          num_layers_before_predictor: 1
          kernel_size: 3
        }
      }
      first_stage_minibatch_size: 256
      first_stage_positive_balance_fraction: 0.5
      first_stage_nms_iou_threshold: 0.7
      first_stage_max_proposals: 300
      rpn_min_size: 16
      first_stage_anchor_generator {
        # the default base anchor size is 256
        grid_anchor_generator {
          scales: [0.5, 1.0, 2.0]
          aspect_ratios: [0.5, 1.0, 2.0]
          height_stride: 16
          width_stride: 16
        }
      }
    }
    region_feature_extractor {
      resnet_block {
        class_name: 'resnet_v1d_50' #the name of backbone
        block_name: 'block4'  #the last residual block of resnet_v1d_50
        stride: 1
        weight_decay: 0.0001
      }
    }
    rcnn_head {
      input_layer: 'resnet_v1d_50/block3'
      initial_crop_size: 14
      maxpool_kernel_size: 2
      maxpool_stride: 2
      num_classes: 20
      second_stage_box_predictor {
        mask_rcnn_box_predictor {
          fc_hyperparams {
            op: FC
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              xavier_initializer {
              }
            }
          }
          agnostic: true
        }
      }
      nms_config {
        score_threshold: 0.0
        iou_threshold: 0.3
        max_detections_per_class: 400
        max_total_detections: 400
      }
  
      second_stage_batch_size: 128
      second_stage_balance_fraction: 0.25
    } 
  }
}
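
In the rpn_head above, grid_anchor_generator with three scales and three aspect ratios places 9 anchors at every cell of the stride-16 grid. Taking the default base anchor size of 256 noted in the config, and assuming the usual Faster R-CNN construction where aspect_ratio means width / height, the anchor shapes work out as follows:

import math

BASE = 256                       # default base anchor size noted in the config
SCALES = (0.5, 1.0, 2.0)
ASPECT_RATIOS = (0.5, 1.0, 2.0)  # assumed to be width / height

for scale in SCALES:
    for ar in ASPECT_RATIOS:
        h = BASE * scale / math.sqrt(ar)
        w = BASE * scale * math.sqrt(ar)
        print(f"scale={scale} ar={ar}: {w:.0f} x {h:.0f}")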

faster_rcnn_r50_fpn

#-*- encoding:utf-8 -*-
# Author: mengli.cml@alibaba-inc.com
# Date: 2018-06-22
# simple_rpn.config: encode the configs used in a simple rpn model

train_config: {
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.00001
          schedule {
            step: 100
            learning_rate: 0.001
          }
          schedule {
            step: 90000
            learning_rate: .0001
          }
          schedule {
            step: 120000
            learning_rate: .00001
          }
          warmup: true
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  fine_tune_checkpoint: "pretrained_models/resnet_v1d_50/model.ckpt"
  num_steps: 150000
  model_dir: "pascal_resnet50_frcnn_model_fpn"
  log_step_count_steps: 1
}

train_data: {
  input_path: "data/voc0712_tfrecord/voc0712_part_*.tfrecord"
  batch_size: 2
  num_readers: 4
  read_block_length: 1
  shuffle: true
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options { 
    random_horizontal_flip {
    }
  }

  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 600
      max_sizes: 1024
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: false
}
    
eval_config: {
  num_examples: 4952
  max_evals: 1000
  num_visualizations: 100
  #metrics_set: 'coco_detection_metrics'
  metrics_set: 'pascal_voc_detection_metrics'
  metrics_set: 'pascal_voc07_detection_metrics'
}

eval_data: {
  input_path: "data/voc0712_tfrecord/VOC2007_test.tfrecord"
  batch_size: 1
  shuffle: false
  num_readers: 1
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 600 
      max_sizes: 1024
    }
  }
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: true
}

export_config {
  batch_size: 1
}

model_config: {
  model_class: 'FasterRcnn'
  faster_rcnn {
    backbone {
      class_name: 'resnet_v1d_50'
      batchnorm_trainable: false
      weight_decay: 0.0001
    } 
    fpn {
      input: 'resnet_v1d_50/block1'
      input: 'resnet_v1d_50/block2'
      input: 'resnet_v1d_50/block3'
      input: 'resnet_v1d_50/block4'
      fea_dim: 256
      extra_conv_layers: 1
      roi_min_level: 2
      roi_max_level: 5
      conv_hyperparams {
        op: CONV
        regularizer {
          l2_regularizer {
            weight: 0.0001
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.01
          }
        }
      }
    }
    rpn_head {
      # if input_layer is not specified, will use fpn features,
      # which all have "FPN/" prefix
      box_predictor {
        weight_shared_convolutional_box_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              truncated_normal_initializer {
                stddev: 0.01
              }
            }
          }
          depth: 512
          num_layers_before_predictor: 1
          kernel_size: 3
        }
      }
      first_stage_minibatch_size: 256
      first_stage_positive_balance_fraction: 0.5
      first_stage_nms_iou_threshold: 0.7
      first_stage_max_proposals: 300
      rpn_min_size: 16
      first_stage_anchor_generator {
        # anchor_size = anchor_scale * feature_map_stride
        multiscale_anchor_generator {
          min_level: 2
          max_level: 6
          anchor_scale: 8
          aspect_ratios: 0.5
          aspect_ratios: 1
          aspect_ratios: 2
          normalize_coordinates: false
          scales_per_octave: 1
        }
      }
    }
    rcnn_head {
      initial_crop_size: 14
      maxpool_kernel_size: 2 
      maxpool_stride: 2
      num_classes: 20

      second_stage_box_predictor {
        mask_rcnn_box_predictor {
          depth: 1024
          num_layers_before_predictor: 2
          fc_hyperparams {
            op: FC
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              xavier_initializer {
              }
            }
          }
          agnostic: true
        }
      }
      nms_config {
        score_threshold: 0.0
        iou_threshold: 0.3
        max_detections_per_class: 400
        max_total_detections: 400
      }
  
      second_stage_batch_size: 128
      second_stage_balance_fraction: 0.25
    } 
  }
}
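
The first-stage anchor generator above follows the rule stated in its own comment, anchor_size = anchor_scale * feature_map_stride, where the stride at FPN level l is 2 ** l. With min_level 2, max_level 6, anchor_scale 8 and scales_per_octave 1, the base anchors run from 32 to 512 pixels:

ANCHOR_SCALE = 8
for level in range(2, 7):          # min_level .. max_level
    stride = 2 ** level            # feature map stride at this pyramid level
    print(f"P{level}: stride={stride:>2}  anchor_size={ANCHOR_SCALE * stride}")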

ssd_r50

##-*- encoding:utf-8 -*-
# Author: wenmeng.zwm@alibaba-inc.com
# Date: 2018-08-22
#
#
#
# SSD with Resnet50 configuration for VOC Dataset.
# Users should configure the fine_tune_checkpoint field in train_config, as well
# as the label_map_path and input_path fields in train_data and eval_data, to
# match their own checkpoint and dataset locations.

model_config {
  model_class: 'SSD'
  ssd {
    backbone {
      class_name: "resnet_v1d_50"
      output_stride: 16
    }
    ssd_head {
      num_classes: 20
      ssd_featuremap_layout {
        from_layer: 'resnet_v1d_50/block3'
        from_layer: 'resnet_v1d_50/block4'
        from_layer: ''
        from_layer: ''
        from_layer: ''
        from_layer: ''
        layer_depth: -1
        layer_depth: -1 
        layer_depth: 512 
        layer_depth: 512 
        layer_depth: 256 
        layer_depth: 256 
      }
      
      #min_depth: 16
      #depth_multiplier: 1.0
      conv_hyperparams {
        activation: RELU,
        regularizer {
          l2_regularizer {
            weight: 0.0005
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.03
            mean: 0.0
          }
        }
        batch_norm {
          train: true,
          scale: true,
          center: true,
          decay: 0.9997,
          epsilon: 0.001,
        }
      }

      box_coder {
        faster_rcnn_box_coder {
          y_scale: 10.0
          x_scale: 10.0
          height_scale: 5.0
          width_scale: 5.0
        }
      }
      matcher {
        argmax_matcher {
          matched_threshold: 0.5
          unmatched_threshold: 0.5
          ignore_thresholds: false
          negatives_lower_than_unmatched: true
          force_match_for_each_row: true
        }
      }
      similarity_calculator {
        iou_similarity {
        }
      }
      anchor_generator {
        ssd_anchor_generator {
          num_layers: 6
          #min_scale: 0.2
          #max_scale: 0.9

          #use caffe anchor scale, the last one is [0.88, 1.0]
          scales: 0.1
          scales: 0.2
          scales: 0.37
          scales: 0.54
          scales: 0.71
          scales: 0.88 
          scales: 1.0 
          aspect_ratios: 1.0
          aspect_ratios: 2.0
          aspect_ratios: 0.5
          aspect_ratios: 3.0
          aspect_ratios: 0.3333
          reduce_boxes_in_lowest_layer: true
          reduce_boxes_in_larger_layers: true 
          interpolate_in_all_layers: true
        }
      }
      box_predictor {
        convolutional_box_predictor {
          min_depth: 0
          max_depth: 0
          num_layers_before_predictor: 0
          kernel_size: 3
          box_code_size: 4
          conv_hyperparams {
            #activation: RELU_6,
            activation: NONE,
            regularizer {
              l2_regularizer {
                weight: 0.0005
              }
            }
            initializer {
              #truncated_normal_initializer {
              #  stddev: 0.03
              #  mean: 0.0
              #}
              xavier_initializer {
                uniform : false 
              }
            }
          }
        }
      }

      post_processing {
        batch_non_max_suppression {
          score_threshold: 0.01 
          iou_threshold: 0.45
          max_detections_per_class: 100
          max_total_detections: 200
        }
        score_converter: SOFTMAX 
      }

      normalize_loss_by_num_matches: true
      loss {
        classification_loss {
          weighted_softmax {
          }
        }
        localization_loss {
          weighted_smooth_l1 {
          }
        }
        hard_example_miner {
          num_hard_examples: 3000
          iou_threshold: 0.99
          loss_type: BOTH 
          max_negatives_per_positive: 3
          min_negatives_per_image: 0
        }
        classification_weight: 3.0
        localization_weight: 1.0
      }
    }
  }
}

train_config: {
  optimizer {
    momentum_optimizer: {
       learning_rate: {
         manual_step_learning_rate {
           initial_learning_rate: 0.001
           schedule {
             step: 80000
             learning_rate: 0.0001
           }
           schedule {
             step: 100000
             learning_rate: 0.00001
           }
         }
       }
       momentum_optimizer_value: 0.9
    }
  }

  #gradient_clipping_by_norm : 10.0
  fine_tune_checkpoint: "pretrained_models/resnet_v1d_50/model.ckpt"
  num_steps: 120000
  model_dir: 'experiments/ssd_resnet50/train'
}

train_data: {
  input_path: "data/voc0712_tfrecord/VOC2007_train.tfrecord"
  input_path: "data/voc0712_tfrecord/VOC2012_train.tfrecord"
  input_path: "data/voc0712_tfrecord/VOC2007_val.tfrecord"
  input_path: "data/voc0712_tfrecord/VOC2012_val.tfrecord"
  batch_size: 32
  num_readers: 4
  shuffle: true
  read_block_length : 32
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }

  # data augmentation
  data_augmentation_options {
    ssd_random_crop {
    }
  }  
  data_augmentation_options {
    random_adjust_brightness {
      max_delta:0.125 
    }
  }  
  data_augmentation_options {
    random_adjust_contrast {
      min_delta : 0.5
      max_delta : 1.5
    }
  }
  data_augmentation_options {
    random_adjust_hue {
      max_delta : 0.046875 
    }
  }
  data_augmentation_options {
    random_adjust_saturation {
      min_delta : 0.5 
      max_delta : 1.5
    }
  }
  data_augmentation_options {
    random_horizontal_flip {
    }
  }

  data_augmentation_options {
    resize_image {
      new_height: 300
      new_width: 300
      method: BILINEAR
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_pascal.py#L177
      means: 123.68
      means: 116.779
      means: 103.939
    } 
  }


}

eval_config: {
  num_examples: 4952
  # Note: The below line limits the evaluation process to 10 evaluations.
  # Remove the below line to evaluate indefinitely.
  # max_evals: 10
  metrics_set :  'pascal_voc07_detection_metrics'
  #metrics_set :  'coco_detection_metrics'
  visualize_groundtruth_boxes : true
  # num of visualizations to be displayed on tensorboard
  num_visualizations : 10 
  # all the evaluation results will be saved to this dir if not ''
  visualization_export_dir: ''
  max_num_boxes_to_visualize: 20 
  min_score_threshold: 0.5
}

eval_data : {
  input_path: "data/voc0712_tfrecord/VOC2007_test.tfrecord"
  batch_size: 1
  shuffle: false
  num_readers: 1
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }

  data_augmentation_options {
    resize_image {
      new_height: 300
      new_width: 300
      method: BILINEAR
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_pascal.py#L177
      means: 123.68
      means: 116.779
      means: 103.939
    } 
  }

}
export_config {
  batch_size: 1
}
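
The ssd_anchor_generator above lists 7 scales for 6 feature layers; by the usual SSD convention (assumed here, and hinted at by interpolate_in_all_layers) each layer uses its own scale plus an extra anchor at sqrt(s_k * s_k+1). Since the images are resized to 300 x 300, the relative scales translate into these absolute box sizes:

import math

INPUT = 300                                    # resize_image new_height / new_width
scales = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.0]

for k in range(6):                             # num_layers: 6
    s, s_next = scales[k], scales[k + 1]
    extra = math.sqrt(s * s_next)              # assumed interpolated anchor scale
    print(f"layer {k}: {s * INPUT:.0f}px (+ extra {extra * INPUT:.0f}px)")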

ssd_r50_fpn

##-*- encoding:utf-8 -*-
# Author: wenmeng.zwm@alibaba-inc.com
# Date: 2018-08-22
#
#
#
# SSD with Resnet50 configuration for VOC Dataset.
# Users should configure the fine_tune_checkpoint field in train_config, as well
# as the label_map_path and input_path fields in train_data and eval_data, to
# match their own checkpoint and dataset locations.

model_config {
  model_class: 'SSD'
  ssd {
    backbone {
      class_name: "resnet_v1d_50"
    }
    ssd_head {
      num_classes: 20
      fpn_featuremap_layout {
        from_layer: 'resnet_v1d_50/block2'
        from_layer: 'resnet_v1d_50/block3'
        from_layer: 'resnet_v1d_50/block4'
        layer_depth: 256
        extra_conv_layers: 2        
      }
      
      conv_hyperparams {
        activation: RELU,
        regularizer {
          l2_regularizer {
            weight: 0.0005
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.03
            mean: 0.0
          }
        }
        batch_norm {
          train: true,
          scale: true,
          center: true,
          decay: 0.9997,
          epsilon: 0.001,
        }
      }

      box_coder {
        faster_rcnn_box_coder {
          y_scale: 10.0
          x_scale: 10.0
          height_scale: 5.0
          width_scale: 5.0
        }
      }
      matcher {
        argmax_matcher {
          matched_threshold: 0.5
          unmatched_threshold: 0.5
          ignore_thresholds: false
          negatives_lower_than_unmatched: true
          force_match_for_each_row: true
        }
      }
      similarity_calculator {
        iou_similarity {
        }
      }
      anchor_generator {
        multiscale_anchor_generator{
          min_level: 3
          max_level: 7
          anchor_scale: 4.0
          aspect_ratios: [0.5, 1.0, 2.0, 0.333, 3.0]
          scales_per_octave: 2 
          normalize_coordinates: true
        }
      }
      box_predictor {
        weight_shared_convolutional_box_predictor{
          depth: 256
          num_layers_before_predictor: 4
          kernel_size: 3
          box_code_size: 4
          conv_hyperparams {
            #activation: RELU_6,
            activation: NONE,
            regularizer {
              l2_regularizer {
                weight: 0.0005
              }
            }
            initializer {
              xavier_initializer {
                uniform : false 
              }
            }
            batch_norm {
               scale: true,
               decay: 0.9997,
               epsilon: 0.001,
            }
          }
        }
      }

      post_processing {
        batch_non_max_suppression {
          score_threshold: 0.01 
          iou_threshold: 0.45
          max_detections_per_class: 100
          max_total_detections: 200
        }
        score_converter: SOFTMAX 
      }

      normalize_loss_by_num_matches: true
      loss {
        classification_loss {
          weighted_softmax {
          }
        }
        localization_loss {
          weighted_smooth_l1 {
          }
        }
        hard_example_miner {
          num_hard_examples: 3000
          iou_threshold: 0.99
          loss_type: BOTH 
          max_negatives_per_positive: 3
          min_negatives_per_image: 0
        }
        classification_weight: 3.0
        localization_weight: 1.0
      }
    }
  }
}

train_config: {
  optimizer {
    momentum_optimizer: {
       learning_rate: {
         manual_step_learning_rate {
           initial_learning_rate: 0.001
           schedule {
             step: 80000
             learning_rate: 0.0001
           }
           schedule {
             step: 100000
             learning_rate: 0.00001
           }
         }
       }
       momentum_optimizer_value: 0.9
    }
  }

  #gradient_clipping_by_norm : 10.0
  fine_tune_checkpoint: "pretrained_models/resnet_v1d_50/model.ckpt"
  num_steps: 120000
  model_dir: 'experiments/ssd_resnet50_fpn'
}

train_data: {
  input_path: "data/voc0712_tfrecord/VOC2007_train.tfrecord"
  input_path: "data/voc0712_tfrecord/VOC2012_train.tfrecord"
  input_path: "data/voc0712_tfrecord/VOC2007_val.tfrecord"
  input_path: "data/voc0712_tfrecord/VOC2012_val.tfrecord"
  batch_size: 32
  num_readers: 4
  shuffle: true
  read_block_length : 32
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }

  # data augmentation
  data_augmentation_options {
    ssd_random_crop {
    }
  }  
  data_augmentation_options {
    random_adjust_brightness {
      max_delta:0.125 
    }
  }  
  data_augmentation_options {
    random_adjust_contrast {
      min_delta : 0.5
      max_delta : 1.5
    }
  }
  data_augmentation_options {
    random_adjust_hue {
      max_delta : 0.046875 
    }
  }
  data_augmentation_options {
    random_adjust_saturation {
      min_delta : 0.5 
      max_delta : 1.5
    }
  }
  data_augmentation_options {
    random_horizontal_flip {
    }
  }

  data_augmentation_options {
    resize_image {
      new_height: 300
      new_width: 300
      method: BILINEAR
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_pascal.py#L177
      means: 123.68
      means: 116.779
      means: 103.939
    } 
  }


}

eval_config: {
  num_examples: 4952
  # Note: The below line limits the evaluation process to 10 evaluations.
  # Remove the below line to evaluate indefinitely.
  # max_evals: 10
  metrics_set :  'pascal_voc07_detection_metrics'
  #metrics_set :  'coco_detection_metrics'
  visualize_groundtruth_boxes : true
  # num of visualizations to be displayed on tensorboard
  num_visualizations : 10 
  # all the evaluation results will be saved to this dir if not ''
  visualization_export_dir: ''
  max_num_boxes_to_visualize: 20 
  min_score_threshold: 0.5
}

eval_data : {
  input_path: "data/voc0712_tfrecord/VOC2007_test.tfrecord"
  batch_size: 1
  shuffle: false
  num_readers: 1
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }

  data_augmentation_options {
    resize_image {
      new_height: 300
      new_width: 300
      method: BILINEAR
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_pascal.py#L177
      means: 123.68
      means: 116.779
      means: 103.939
    } 
  }

}
export_config {
  batch_size: 1
}
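
Compared with the plain SSD head above, this variant attaches anchors to FPN levels 3 through 7. Using the same anchor_size = anchor_scale * stride rule as the Faster R-CNN FPN config, anchor_scale 4.0 gives base sizes of 32 to 512 pixels, and scales_per_octave: 2 adds a second size at 2 ** 0.5 within each octave (the octave subdivision is the usual multiscale-anchor convention and is assumed here):

ANCHOR_SCALE = 4.0
SCALES_PER_OCTAVE = 2

for level in range(3, 8):                      # min_level .. max_level
    base = ANCHOR_SCALE * 2 ** level           # anchor_scale * feature stride
    sizes = [base * 2 ** (i / SCALES_PER_OCTAVE) for i in range(SCALES_PER_OCTAVE)]
    print(f"P{level}: " + ", ".join(f"{s:.0f}px" for s in sizes))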

ssd_mobilenet

yolo3

##-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2020-06-01
# yolo3.config: encode the configs used in a yolo-v3 model

model_config {
  model_class: 'YOLO3'
  yolo {
    backbone {
      class_name: "darknet53"
    }
    yolo_head {
      num_classes: 20
      yolo_featuremap_layout {
        from_layer: 'conv4_res'
        from_layer: 'conv5_res'
        from_layer: 'conv6'
      }
      
      conv_hyperparams {
        activation: LEAKY_RELU
        regularizer {
          l2_regularizer {
            weight: 0.0005
          }
        }
        initializer {
          xavier_initializer {
            uniform : true
          }
        }
        batch_norm {
          decay: 0.997
          center: true
          scale: true
          epsilon: 1e-05
        }
      }

      anchor_generator {
        yolo_anchor_generator {
          anchor_group {
            anchor_size {
              width: 10
              height: 13
            }
            anchor_size {
              width: 16
              height: 30
            }
            anchor_size {
              width: 33
              height: 23
            }
          }
          anchor_group {
            anchor_size {
              width: 30
              height: 61
            }
            anchor_size {
              width: 62
              height: 45
            }
            anchor_size {
              width: 59
              height: 119
            }
          }
          anchor_group {
            anchor_size {
              width: 116
              height: 90
            }
            anchor_size {
              width: 156
              height: 198
            }
            anchor_size {
              width: 373
              height: 326
            }
          }
        }
      }

      box_predictor {
        yolo_box_predictor {
          conv_hyperparams {
            activation: LEAKY_RELU
            regularizer {
              l2_regularizer {
                weight: 0.0005
              }
            }
            initializer {
             xavier_initializer {
                uniform : true
              }
            }
            batch_norm {
              decay: 0.997
              center: true
              scale: true
              epsilon: 1e-05
            }
          }
        }
      }

      post_processing {
        batch_non_max_suppression {
          score_threshold: 0.01
          iou_threshold: 0.45
          max_detections_per_class: 200
          max_total_detections: 200
        }
        score_converter: SIGMOID
      }

      ignore_threshold: 0.5
    }
  }
}

train_config: {
  optimizer {
    momentum_optimizer: {
       learning_rate: {
         manual_step_learning_rate {
           initial_learning_rate: 0.0000001
           schedule {
             step: 1000
             learning_rate: 0.001
           }
           schedule {
             step: 40000
             learning_rate: 0.0001
           }
           schedule {
             step: 45000
             learning_rate: 0.00001
           }
           warmup: true
         }
       }
       momentum_optimizer_value: 0.9
    }
  }

  #gradient_clipping_by_norm : 10.0

  num_steps: 50200
  fine_tune_checkpoint: "pretrained_models/darknet53/model.ckpt"
  model_dir: 'experiments/yolo/output/yolo3_voc'
  summary_model_vars: false
  sync_replicas: false
  train_distribute: "mirrored"
  num_gpus_per_worker: 8
}

train_data: {
  input_path: "data/voc0712_tfrecord/voc0712_part_*.tfrecord"
  batch_size: 8 #16
  num_readers: 4
  shuffle: true
  read_block_length : 32
  bucket_sizes: 10
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }

  data_augmentation_options {
    random_distort_color {
    }
  }

  data_augmentation_options {
    random_horizontal_flip {
    }
  }

  data_augmentation_options {
    random_pad_image {
      max_height_ratio: 1.6
      max_width_ratio: 1.6
    }
  }

  data_augmentation_options {
    random_crop_image {
      min_aspect_ratio: 0.25
      max_aspect_ratio: 4.0
      min_area: 0.1
      max_area: 1.0
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0.0
      original_maxval: 255.0
      target_minval: 0.0
      target_maxval: 1.0
    }
  }

  data_augmentation_options {
    random_resize_image {
      new_heights: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
      new_widths: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
      method: BICUBIC
    }
  }
}

eval_config: {
  num_examples: 4952
  metrics_set :  'pascal_voc07_detection_metrics'
  visualize_groundtruth_boxes : true 
  # num of visualizations to be displayed on tensorboard
  num_visualizations : 32
  # all the evaluation results will be saved to this dir if not ''
  visualization_export_dir: ''
  max_num_boxes_to_visualize: 32
  min_score_threshold: 0.5
  matching_iou_threshold: 0.5
}

eval_data : {
  input_path: "data/voc0712_tfrecord/VOC2007_test.tfrecord"
  batch_size: 1
  shuffle: false
  num_readers: 1
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0.0
      original_maxval: 255.0
      target_minval: 0.0
      target_maxval: 1.0
    }
  }

  data_augmentation_options {
    resize_image {
      new_height: 416
      new_width: 416
      method: BICUBIC
    }
  }

}
export_config {
  batch_size: 1
}
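
The three anchor_group blocks above are the canonical YOLOv3 anchors, one group per feature map in yolo_featuremap_layout. Assuming the usual darknet53 head strides of 8, 16 and 32, the 416 x 416 eval resize yields 52 x 52, 26 x 26 and 13 x 13 prediction grids, and every multi-scale training size (320 to 608 in steps of 32) divides evenly by all three strides:

STRIDES = (8, 16, 32)                  # assumed strides of the three YOLOv3 heads

def grid_sizes(image_size):
    return [image_size // s for s in STRIDES]

print(416, grid_sizes(416))            # eval resize -> [52, 26, 13]
for size in range(320, 609, 32):       # random_resize_image training sizes
    assert all(size % s == 0 for s in STRIDES)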

Image Segmentation

deeplab_v3+_r101_stage1

#-*- encoding:utf-8 -*-
# Author: mengli.cml@alibaba-inc.com
# Date: 2018-06-22
# step1 of deeplab configuration

train_config: {
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        poly_decay_learning_rate {
          learning_rate_base: 0.007
          total_steps: 30000
          power: 0.9
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  save_checkpoints_steps: 5000
  #fine_tune_checkpoint: "xception/model.ckpt"
  fine_tune_checkpoint: "pretrained_models/resnet_v1d_101/model.ckpt"
  num_steps: 30000
  model_dir: "pascal_deeplab_model"
}

train_data: {
  input_path: "data/pascal_voc_seg_aug/voc_ev_train.tfrecord"
  batch_size: 6
  num_readers: 4
  read_block_length: 1
  shuffle: true
  seg_decoder_config { }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }

  data_augmentation_options {
    deeplab_random_crop {
      crop_size: 513
    }
  }

  data_augmentation_options {
    deeplab_random_horizontal_flip {
    }
  }
}
    
eval_config: {
#  num_examples: 100
  max_evals: 1000
  num_visualizations: 100
}

eval_data: {
  input_path: "data/pascal_voc_seg_aug/voc_ev_val.tfrecord"

  batch_size: 1
  shuffle: false
  num_readers: 1
  seg_decoder_config {
  } 

  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }

  num_epochs: 1
}

model_config: {
  model_class: 'DeepLab'
  deeplab {
    backbone {
        # class_name: 'xception_65'
        class_name: 'resnet_v1d_101'
        batchnorm_trainable: true
        weight_decay: 0.0005
        output_stride: 16
    } 

    aspp_input_layer: 'resnet_v1d_101/block4'
   
    aspp_block {
      image_level_features: true
      batchnorm_trainable: true
      weight_decay: 1e-5
      feature_depth: 256
      atrous_rates: 6
      atrous_rates: 12
      atrous_rates: 18
      keep_prob: 0.9
    } 

    seg_decoder_head {
      weight_decay: 1e-5
      batchnorm_trainable: true
      # input_layer: 'xception_65/entry_flow/block2/unit_1/xception_module/separable_conv2_pointwise'
      input_layer: 'resnet_v1d_101/block1'
      decoder_depth: 256
      output_stride: 4 
      num_classes: 21
    }
  }
}
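
With output_stride: 16 and 513 x 513 random crops, the backbone feature map fed to the ASPP block is 33 x 33: the crop size is chosen so that (crop_size - 1) is divisible by the output stride, which keeps features and labels aligned when upsampling, as in the DeepLab papers. A quick check of that arithmetic:

def feature_size(crop_size, output_stride):
    # (crop - 1) / stride + 1: the "aligned" size used with DeepLab-style crops
    assert (crop_size - 1) % output_stride == 0
    return (crop_size - 1) // output_stride + 1

print(feature_size(513, 16))  # 33  -> ASPP input resolution
print(feature_size(513, 4))   # 129 -> seg_decoder_head output_stride: 4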

deeplab_v3+_r101_stage2

#-*- encoding:utf-8 -*-
# Author: mengli.cml@alibaba-inc.com
# Date: 2018-06-22
# step2 of deeplab configuration

train_config: {
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        poly_decay_learning_rate {
          learning_rate_base: 0.0002
          total_steps: 30000
          power: 0.9
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  save_checkpoints_steps: 5000
  fine_tune_checkpoint: "pascal_deeplab_model/model.ckpt-30000"
  num_steps: 30000
  model_dir: "pascal_deeplab_model_finetune"
}

train_data: {
  input_path: "pascal_voc_seg/train-00000-of-00004.tfrecord"
  input_path: "pascal_voc_seg/train-00001-of-00004.tfrecord"
  input_path: "pascal_voc_seg/train-00002-of-00004.tfrecord"
  input_path: "pascal_voc_seg/train-00003-of-00004.tfrecord"
  batch_size: 2
  num_readers: 4
  read_block_length: 1
  shuffle: true
  seg_decoder_config { }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }

  data_augmentation_options {
    deeplab_random_crop {
      crop_size: 513
    }
  }

  data_augmentation_options {
    deeplab_random_horizontal_flip {
    }
  }
}
    
eval_config: {
#  num_examples: 100
  max_evals: 1000
  num_visualizations: 100
}

eval_data: {
  input_path: "pascal_voc_seg/val-00000-of-00004.tfrecord"
  input_path: "pascal_voc_seg/val-00001-of-00004.tfrecord"
  input_path: "pascal_voc_seg/val-00002-of-00004.tfrecord"
  input_path: "pascal_voc_seg/val-00003-of-00004.tfrecord"

  batch_size: 1
  shuffle: false
  num_readers: 1
  seg_decoder_config {
  } 

  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }

  num_epochs: 1
}

model_config: {
  model_class: 'DeepLab'
  deeplab {
    backbone {
        # class_name: 'xception_65'
        class_name: 'resnet_v1d_101'
        batchnorm_trainable: false
        weight_decay: 0.0005
        output_stride: 8
    } 

    aspp_input_layer: 'resnet_v1d_101/block4'
   
    aspp_block {
      image_level_features: true
      batchnorm_trainable: false
      weight_decay: 1e-5
      feature_depth: 256
      atrous_rates: 12
      atrous_rates: 24
      atrous_rates: 36
      keep_prob: 0.9
    } 

    seg_decoder_head {
      weight_decay: 1e-5
      batchnorm_trainable: false
      # input_layer: 'xception_65/entry_flow/block2/unit_1/xception_module/separable_conv2_pointwise'
      input_layer: 'resnet_v1d_101/block1'
      decoder_depth: 256
      output_stride: 4 
      num_classes: 21
    }
  }
}
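
Stage 2 fine-tunes the stage-1 checkpoint at output_stride 8 instead of 16, freezes batch norm, and lowers the learning rate. Note how the atrous rates double from 6/12/18 to 12/24/36: when the output stride is halved, the rates are scaled by the same factor so each ASPP branch keeps the same effective receptive field, as in the DeepLab papers. The relationship in one line:

stage1 = {"output_stride": 16, "atrous_rates": [6, 12, 18]}
stage2_rates = [r * stage1["output_stride"] // 8 for r in stage1["atrous_rates"]]
print(stage2_rates)  # [12, 24, 36], matching the stage-2 aspp_block above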

Instance Segmentation

mask_rcnn_r50

#-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-01-16
# mask_rcnn.config: mscoco mask rcnn model config

train_config: {
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.00002
          schedule {
            step: 100
            learning_rate: 0.001
          }
          schedule {
            step: 240000
            learning_rate: .0001
          }
          schedule {
            step: 320000
            learning_rate: .00001
          }
          warmup: true
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  fine_tune_checkpoint: "pretrained_models/resnet_v1d_50/model.ckpt"
  num_steps: 360000
  model_dir: "experiments/coco_resnet50_maskrcnn_model"
}

train_data: {
  input_path: "data/coco_wmask/coco_train_*.tfrecord"
  batch_size: 1
  num_readers: 8
  read_block_length: 1
  shuffle: true
  shuffle_buffer_size: 512
  prefetch_size: 256
  voc_decoder_config {
    label_map_path: "data/coco_wmask/mscoco_label_map.pbtxt"
    load_instance_masks: true
    mask_format: PNG_MASK_FORMAT
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options { 
    random_horizontal_flip {
    }
  }

  data_augmentation_options {
    random_resize_to_range {
      min_sizes: [640, 672, 704, 736, 768, 800]
      max_sizes: [1333, 1333, 1333, 1333, 1333, 1333]
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: false
}
    
eval_config: {
  num_examples: 5000
  num_visualizations: 16
  metrics_set: 'coco_detection_metrics'
  metrics_set: 'coco_mask_metrics'
  visualize_groundtruth_boxes: true
}

eval_data: {
  input_path: "data/coco_wmask/coco_val.tfrecord"
  batch_size: 1
  shuffle: false
  prefetch_size: 256
  num_readers: 1
  voc_decoder_config {
    label_map_path: "data/mscoco/mscoco_label_map.pbtxt"
    load_instance_masks: true
    mask_format: PNG_MASK_FORMAT
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 800
      max_sizes: 1333
    }
  }
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: true
}

export_config {
  batch_size: 1
}

model_config: {
  model_class: 'FasterRcnn'
  faster_rcnn {
    backbone {
      class_name: 'resnet_v1d_50'
      batchnorm_trainable: false
      weight_decay: 0.0001
      output_stride: 16
    }
    rpn_head {
      input_layer: 'resnet_v1d_50/block3'
      box_predictor {
        convolutional_box_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              truncated_normal_initializer {
                stddev: 0.01
              }
            }
          }
          min_depth: 512
          max_depth: 512
          num_layers_before_predictor: 1
          kernel_size: 3
        }
      }
      first_stage_minibatch_size: 256
      first_stage_positive_balance_fraction: 0.5
      first_stage_nms_iou_threshold: 0.7
      first_stage_max_proposals: 2000
      rpn_min_size: 0
      first_stage_anchor_generator {
        # the default base anchor size is 256
        grid_anchor_generator {
          scales: [0.125, 0.25, 0.5, 1.0, 2.0]
          aspect_ratios: [0.5, 1.0, 2.0]
          height_stride: 16
          width_stride: 16
        }
      }
    }
    region_feature_extractor {
      resnet_block {
        class_name: 'resnet_v1d_50' #the name of backbone
        block_name: 'block4'  #the last residual block of resnet_v1d_50
        stride: 1
        weight_decay: 0.0001
      }
    }
    rcnn_head {
      input_layer: 'resnet_v1d_50/block3'
      initial_crop_size: 14
      maxpool_kernel_size: 2
      maxpool_stride: 2
      num_classes: 90
      second_stage_box_predictor {
        mask_rcnn_box_predictor {
          fc_hyperparams {
            op: FC
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              xavier_initializer {
              }
            }
          }
         agnostic: true
       }
      }
      nms_config {
        score_threshold: 0.05
        iou_threshold: 0.5
        max_detections_per_class: 100
        max_total_detections: 100
      }

      second_stage_batch_size: 512
      second_stage_balance_fraction: 0.25
    }
    mrcnn_head {
      input_layer: 'resnet_v1d_50/block3'
      initial_crop_size: 14
      maxpool_kernel_size: 2
      maxpool_stride: 2
      num_classes: 90
      third_stage_mask_predictor {
        mask_rcnn_mask_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              variance_scaling_initializer {
              }
            }
          }
          mask_height: 14
          mask_width: 14
          mask_prediction_conv_depth: 256
          mask_prediction_num_conv_layers: 1
          convolve_then_upsample_masks: true
        }
      }
    } 
  }
}
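
In the rcnn_head above, second_stage_batch_size: 512 with second_stage_balance_fraction: 0.25 means at most 128 of the 512 proposals sampled per image are positives, mirroring the first-stage 256 x 0.5 split; the fraction is an upper bound when fewer positives match (standard Faster R-CNN sampling, assumed here):

def sampled_positives(batch_size, balance_fraction, num_positive_matches):
    # the positive quota is a cap; the rest of the minibatch is filled with negatives
    return min(int(batch_size * balance_fraction), num_positive_matches)

print(sampled_positives(512, 0.25, 1000))  # 128 positives, 384 negatives
print(sampled_positives(256, 0.5, 40))     # first stage with few positives: all 40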
   

mask_rcnn_r50_fpn

#-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-01-16
# mask_rcnn_fpn.config: coco_wmask mask rcnn fpn model config

train_config: {
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.00002
          schedule {
            step: 100
            learning_rate: 0.001
          }
          schedule {
            step: 240000
            learning_rate: .0001
          }
          schedule {
            step: 320000
            learning_rate: .00001
          }
          warmup: true
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  fine_tune_checkpoint: "pretrained_models/resnet_v1d_50/model.ckpt"
  num_steps: 360000
  model_dir: "experiments/coco_resnet50_maskrcnn_model_fpn"
}

train_data: {
  input_path: "data/coco_wmask/coco_train_*.tfrecord"
  batch_size: 1
  num_readers: 8
  read_block_length: 1
  shuffle: true
  shuffle_buffer_size: 512
  prefetch_size: 256
  voc_decoder_config {
    label_map_path: "data/coco_wmask/mscoco_label_map.pbtxt"
    load_instance_masks: true
    mask_format: PNG_MASK_FORMAT
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options { 
    random_horizontal_flip {
    }
  }

  data_augmentation_options {
    random_resize_to_range {
      min_sizes: [640, 672, 704, 736, 768, 800]
      max_sizes: [1333, 1333, 1333, 1333, 1333, 1333]
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: false
}
    
eval_config: {
  num_examples: 5000
  num_visualizations: 16
  metrics_set: 'coco_detection_metrics'
  metrics_set: 'coco_mask_metrics'
  visualize_groundtruth_boxes: true
}

eval_data: {
  input_path: "data/coco_wmask/coco_val.tfrecord"
  batch_size: 1
  shuffle: false
  prefetch_size: 256
  num_readers: 1
  voc_decoder_config {
    label_map_path: "data/coco_wmask/mscoco_label_map.pbtxt"
    load_instance_masks: true
    mask_format: PNG_MASK_FORMAT
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 800 
      max_sizes: 1333
    }
  }
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: true
}

export_config {
  batch_size: 1
}

model_config: {
  model_class: 'FasterRcnn'
  faster_rcnn {
    backbone {
      class_name: 'resnet_v1d_50'
      batchnorm_trainable: false
      weight_decay: 0.0001
    }
    fpn {
      input: 'resnet_v1d_50/block1'
      input: 'resnet_v1d_50/block2'
      input: 'resnet_v1d_50/block3'
      input: 'resnet_v1d_50/block4'
      fea_dim: 256
      extra_conv_layers: 1
      roi_min_level: 2
      roi_max_level: 5
      conv_hyperparams {
        op: CONV
        regularizer {
          l2_regularizer {
            weight: 0.0001
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.01
          }
        }
      }
    } 
    rpn_head {
      # if input_layer is not specified, will use fpn features,
      # which all have "FPN/" prefix
      box_predictor {
        weight_shared_convolutional_box_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              truncated_normal_initializer {
                stddev: 0.01
              }
            }
          }
          depth: 256
          num_layers_before_predictor: 1
          kernel_size: 3
        }
      }   
      first_stage_minibatch_size: 256
      first_stage_positive_balance_fraction: 0.5
      first_stage_nms_iou_threshold: 0.7
      first_stage_max_proposals: 2000
      rpn_min_size: 16
      first_stage_anchor_generator {
        # anchor_size = anchor_scale * feature_map_stride
        multiscale_anchor_generator {
          min_level: 2
          max_level: 6
          anchor_scale: 8
          aspect_ratios: 0.5
          aspect_ratios: 1
          aspect_ratios: 2
          normalize_coordinates: false
          scales_per_octave: 1
        }
      }
    }
    rcnn_head {
      initial_crop_size: 14
      maxpool_kernel_size: 2 
      maxpool_stride: 2
      num_classes: 90 
      second_stage_box_predictor {
        mask_rcnn_box_predictor {
          num_layers_before_predictor: 2
          depth: 1024
          fc_hyperparams {
            op: FC
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              xavier_initializer {
              }
            }
          }
         agnostic: true
       }
      }
      nms_config {
        score_threshold: 0.05
        iou_threshold: 0.5
        max_detections_per_class: 100
        max_total_detections: 100
      }
  
      second_stage_batch_size: 512
      second_stage_balance_fraction: 0.25
    } 
    mrcnn_head {
      initial_crop_size: 28
      maxpool_kernel_size: 2 
      maxpool_stride: 2
      num_classes: 90 
      third_stage_mask_predictor {
        mask_rcnn_mask_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              variance_scaling_initializer {
              }
            }
          }
          mask_height: 28
          mask_width: 28
          mask_prediction_conv_depth: 256
          mask_prediction_num_conv_layers: 5
          convolve_then_upsample_masks: true
        }
      }
    } 
  }
}
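
The fpn block above crops ROI features from levels roi_min_level through roi_max_level. A sketch assuming the FPN-paper assignment rule, level = k0 + floor(log2(sqrt(w * h) / canonical_scale)) clamped to [2, 5]; the text_krcnn config later on this page exposes the same knobs as roi_canonical_scale and roi_canonical_level, and the 224 / level-4 defaults used below are assumptions:

import math

def roi_level(box_w, box_h, canonical_scale=224, canonical_level=4,
              min_level=2, max_level=5):
    # FPN heuristic: larger ROIs are cropped from coarser pyramid levels
    level = canonical_level + math.floor(
        math.log2(math.sqrt(box_w * box_h) / canonical_scale))
    return max(min_level, min(max_level, level))

for size in (32, 112, 224, 448, 896):
    print(size, roi_level(size, size))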
   

Text Detection

text_krcnn_r50_fpn

#-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-04-27
# text_krcnn_resnet50.config:
#   icdar text krcnn model training config

train_config: {
  optimizer {
    adam_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.00001
          decay_steps: 150000
          decay_factor: 0.5
          min_learning_rate: 0.0000001
        }
      }
    }
    use_moving_average: false
  }
  # gradient_clipping_by_norm: 0.0
  fine_tune_checkpoint: "pretrained_models/resnet_v1d_50/model.ckpt"
  num_steps: 400000
  model_dir: "experiments/icdar_ch4/text_krcnn_resnet50_fpn"
  save_checkpoints_steps: 2000
  save_summary_steps: 100
  log_step_count_steps: 100
  summary_model_vars: false
}

train_data: {
  input_path: "data/icdar_detection_tfrecords/icdar_training_*.tfrecord"
  batch_size: 1
  shuffle: true
  shuffle_buffer_size: 64
  prefetch_size: 64
  num_readers: 8
  text_detection_decoder_config {
    label_map_path: "data/icdar_detection_tfrecords/label_map.pbtxt"
  }

  data_augmentation_options {
    random_jitter_aspect_ratio {
      min_jitter_coef: 0.8
      max_jitter_coef: 1.2
    }
  }

  data_augmentation_options {
    random_rotation {
      min_angle: -10
      max_angle: 10
      use_keypoints_calc_boxes: true
    }
  }
  
  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 640
      max_sizes: 2000
      min_sizes: 800
      max_sizes: 2000
      min_sizes: 960
      max_sizes: 2000
      min_sizes: 1120
      max_sizes: 2000
    }
  }
  
  data_augmentation_options {
    random_distort_color {
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: false
}
    
eval_config: {
  num_examples: 500
  num_visualizations : 16
  metrics_set: "icdar_detection_metrics"
  visualization_export_dir: ''
}

eval_data: {
  input_path: "data/icdar_detection_tfrecords/icdar-ch4-test.tfrecord"
  batch_size: 1
  shuffle: false
  prefetch_size: 32
  text_detection_decoder_config {
    label_map_path: "data/icdar_detection_tfrecords/label_map.pbtxt"
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 960
      max_sizes: 2000
    }
  }
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: true
}

export_config {
  batch_size: 1
}

model_config: {
  model_class: 'TextKRCNN'
  text_krcnn {
    backbone {
        class_name: 'resnet_v1d_50'
        batchnorm_trainable: false
        weight_decay: 0.0001
    } 
    fpn {
      input: 'resnet_v1d_50/block1'
      input: 'resnet_v1d_50/block2'
      input: 'resnet_v1d_50/block3'
      input: 'resnet_v1d_50/block4'
      fea_dim: 256
      extra_conv_layers: 1
      roi_min_level: 2
      roi_max_level: 5
      roi_canonical_scale: 168
      roi_canonical_level: 4
      conv_hyperparams {
        op: CONV
        regularizer {
          l2_regularizer {
            weight: 0.0001
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.01
          }
        }
      }
    }
    rpn_head {
      # if input_layer is not specified, the FPN features are used;
      # they are all prefixed with "FPN/"
      box_predictor {
        weight_shared_convolutional_box_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              truncated_normal_initializer {
                stddev: 0.01
              }
            }
          }
          depth: 256
          num_layers_before_predictor: 1
          kernel_size: 3
        }
      }

      first_stage_minibatch_size: 256
      first_stage_positive_balance_fraction: 0.5
      first_stage_nms_iou_threshold: 0.7
      first_stage_max_proposals: 300
      rpn_min_size: 8
      first_stage_anchor_generator {
        multiscale_anchor_generator {
          min_level: 2
          max_level: 6
          anchor_scale: 6
          aspect_ratios: [0.2, 0.5, 1, 2, 5]
          normalize_coordinates: false
          scales_per_octave: 1
        }
      }
    }
    rcnn_head {
      initial_crop_size: 14
      maxpool_kernel_size: 2
      maxpool_stride: 2
      num_classes: 1

      second_stage_box_predictor {
        mask_rcnn_box_predictor {
          num_layers_before_predictor: 2
          depth: 1024
          fc_hyperparams {
            op: FC
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              truncated_normal_initializer {
                stddev: 0.01
              }
            }
          }
          agnostic: true
        }
      }

      hard_example_miner {
        num_hard_examples: 128
        iou_threshold: 0.99
        loss_type: BOTH
      }

      nms_config {
        score_threshold: 0.7
        iou_threshold: 0.3
        max_detections_per_class: 400
        max_total_detections: 400
      }

      second_stage_batch_size: 128
      second_stage_balance_fraction: 0.25
    }

    keypoint_head {
      keypoint_predictor {
        text_resnet_keypoint_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              variance_scaling_initializer {
              }
            }
          }
          fc_hyperparams {
            op: FC
            regularizer {
              l2_regularizer {
                 weight: 0.0001
              }
            }
            initializer {
              variance_scaling_initializer {
              }
            }
          }
        }
      }
      initial_crop_size: 28
      maxpool_kernel_size: 2
      maxpool_stride: 2
      num_keypoints: 4
      predict_direction: false
      direction_trainable: false
    }
  }
}
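
The roi_canonical_scale / roi_canonical_level / roi_min_level / roi_max_level fields in the fpn block above are the usual FPN RoI-to-level assignment parameters. A sketch of the standard heuristic from the FPN paper, under the assumption that this framework follows it (the exact rounding may differ):

import math

def fpn_roi_level(box_height, box_width,
                  canonical_scale=168, canonical_level=4,
                  min_level=2, max_level=5):
    # level = canonical_level + log2(sqrt(area) / canonical_scale), clipped to the RoI range.
    scale = math.sqrt(box_height * box_width)
    level = math.floor(canonical_level + math.log2(scale / canonical_scale + 1e-8))
    return int(min(max(level, min_level), max_level))

# e.g. a 168x168 RoI maps to level 4, while a small 40x40 text box falls back to level 2.
print(fpn_roi_level(168, 168), fpn_roi_level(40, 40))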
   

Text Recognition

crnn_ctc_r15

##-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-02-28
# text crnn ctc config for receipt text

train_config: {
  optimizer {
    adam_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.0001
          decay_steps: 30000
          decay_factor: 0.7
        }
      }
    }
    use_moving_average: false
  }
  # gradient_clipping_by_norm : 10.0
  num_steps: 1000000
  model_dir: 'experiments/recipt_text/crnn_ctc_resnet15_fixed_height_wopretrain'
  save_checkpoints_steps: 2000
  save_summary_steps: 100
  log_step_count_steps: 100
  summary_model_vars: false
  # for distributed training only
  # sync_replicas: false
  # replicas_to_aggregate: 8
  # num_worker_replicas: 8
}

train_data: {
  input_path: "data/recipt_text/recognition_tfrecords/train_*.tfrecord"
  batch_size: 64
  shuffle: true
  num_readers: 8
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict"
    min_input_ratio: 0.125
    max_input_ratio: 38
    num_buckets: 10
  }

  data_augmentation_options {
    resize_image_with_fixed_height {
      new_height: 32
    }
  }

  data_augmentation_options {
    random_distort_color {
    }
  }

  data_augmentation_options {
    rgb_to_gray {
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0
      original_maxval: 255
      target_minval: 0
      target_maxval: 1
    }
  }
}

eval_config: {
  num_visualizations : 16
}

eval_data: {
  input_path: "data/recipt_text/recognition_tfrecords/test.tfrecord"
  batch_size: 64
  shuffle: false
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict"
    min_input_ratio: 0.125
    max_input_ratio: 100
    num_buckets: 10
  }

  data_augmentation_options {
    resize_image_with_fixed_height {
      new_height: 32
    }
  }

  data_augmentation_options {
    rgb_to_gray {
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0
      original_maxval: 255
      target_minval: 0
      target_maxval: 1
    }
  }
  num_epochs: 1
}

export_config {
  batch_size: -1
}

model_config {
  model_class: 'TextRecognition'
  text_recognition {
    backbone {
      class_name: 'text_resnet15'
    }
    ctc_head {
      input_layer: 'text_resnet15/conv5_0'
      crnn_encoder {
        num_layers: 2
        basic_lstm {
          num_units: 512
        }
        encoder_type: UNI
      }
      ctc_decoder {
      }
    }
  }
}
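
A quick sketch of the learning-rate curve described by exponential_decay_learning_rate above, assuming the conventional tf.train.exponential_decay semantics (whether staircase decay is used is not stated in this config):

def exponential_decay_lr(step, initial_lr=0.0001, decay_steps=30000,
                         decay_factor=0.7, staircase=False):
    exponent = step // decay_steps if staircase else step / decay_steps
    return initial_lr * decay_factor ** exponent

# learning rate at a few points of the 1M-step schedule
for step in (0, 30000, 90000, 300000):
    print(step, exponential_decay_lr(step))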

crnn_attention_r15

##-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-03-20
# text recognition config for receipt text

train_config: {
  optimizer {
    adam_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.0001
          decay_steps: 30000
          decay_factor: 0.7
        }
      }
    }
    use_moving_average: false
  }
  # gradient_clipping_by_norm : 10.0
  num_steps: 1000000
  fine_tune_checkpoint: "pretrained_models/general_crnn_attn_resnet15/model.ckpt"
  model_dir: 'experiments/recipt_text/crnn_attn_resnet15_fixed_height'
  save_checkpoints_steps: 2000
  save_summary_steps: 100
  log_step_count_steps: 100
  summary_model_vars: false
  # for distributed training only
  # sync_replicas: false
  # replicas_to_aggregate: 8
  # num_worker_replicas: 8
}

train_data: {
  input_path: "data/recipt_text/recognition_tfrecords/train_*.tfrecord"
  batch_size: 64
  shuffle: true
  num_readers: 8
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict"
    min_input_ratio: 0.125
    max_input_ratio: 38
    num_buckets: 10
  }

  data_augmentation_options {
    resize_image_with_fixed_height {
      new_height: 32
    }
  }

  data_augmentation_options {
    random_distort_color {
    }
  }

  data_augmentation_options {
    rgb_to_gray {
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0
      original_maxval: 255
      target_minval: 0
      target_maxval: 1
    }
  }
}

eval_config: {
  num_visualizations : 16
}

eval_data: {
  input_path: "data/recipt_text/recognition_tfrecords/test.tfrecord"
  batch_size: 64
  shuffle: false
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict"
    min_input_ratio: 0.125
    max_input_ratio: 100
    num_buckets: 10
  }

  data_augmentation_options {
    resize_image_with_fixed_height {
      new_height: 32
    }
  }

  data_augmentation_options {
    rgb_to_gray {
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0
      original_maxval: 255
      target_minval: 0
      target_maxval: 1
    }
  }
  num_epochs: 1
}

export_config {
  batch_size: -1
}

model_config {
  model_class: 'TextRecognition'
  text_recognition {
    backbone {
      class_name: 'text_resnet15'
    }
    attention_head {
      input_layer: 'text_resnet15/conv5_0'
      crnn_encoder {
        num_layers: 2
        basic_lstm {
          num_units: 512
        }
        encoder_type: UNI
      }
      attention_decoder {
        embedding_size: 64
        num_layers: 2
        basic_lstm {
          num_units: 512
        }
        attention_mechanism: "normed_bahdanau"
        # visualize_type: "line"
      }
    }
  }
}
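
min_input_ratio / max_input_ratio / num_buckets in text_recognition_decoder_config suggest that crops are grouped by width/height ratio so that a batch contains lines of similar width. A hypothetical sketch of such bucketing, assuming uniform bucket boundaries (the decoder's actual boundaries are not documented here):

def bucket_id(width, height, min_ratio=0.125, max_ratio=38, num_buckets=10):
    ratio = width / float(height)
    ratio = min(max(ratio, min_ratio), max_ratio)          # clip to the configured range
    bucket_width = (max_ratio - min_ratio) / num_buckets   # uniform buckets, an assumption
    return min(int((ratio - min_ratio) / bucket_width), num_buckets - 1)

# a 32x320 crop (ratio 10) and a 32x64 crop (ratio 2) land in different buckets
print(bucket_id(320, 32), bucket_id(64, 32))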

crnn_mono_attention_r15

##-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2020-06-15
# text recognition config for receipt text

train_config: {
  optimizer {
    adam_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.0001
          decay_steps: 30000
          decay_factor: 0.7
        }
      }
    }
    use_moving_average: false
  }
  # gradient_clipping_by_norm : 10.0
  num_steps: 1000000
  fine_tune_checkpoint: "pretrained_models/general_crnn_mono_norm_attn_resnet15_xxlarge/model.ckpt"
  model_dir: 'experiments/recipt_text/crnn_mono_norm_attn_resnet15_fixed_height_xxlarge_dict'
  save_checkpoints_steps: 2000
  save_summary_steps: 100
  log_step_count_steps: 100
  summary_model_vars: false
  # for distributed training only
  # sync_replicas: false
  # replicas_to_aggregate: 8
  # num_worker_replicas: 8
}

train_data: {
  input_path: "data/recipt_text/recognition_tfrecords/train_*.tfrecord"
  batch_size: 64
  shuffle: true
  num_readers: 8
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict_xxlarge"
    min_input_ratio: 0.125
    max_input_ratio: 38
    num_buckets: 10
  }

  data_augmentation_options {
    resize_image_with_fixed_height {
      new_height: 32
    }
  }

  data_augmentation_options {
    random_distort_color {
    }
  }

  data_augmentation_options {
    random_rgb_to_gray {
      probability: 0.2
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
}

eval_config: {
  num_visualizations : 16
}

eval_data: {
  input_path: "data/recipt_text/recognition_tfrecords/test.tfrecord"
  batch_size: 64
  shuffle: false
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict_xxlarge"
    min_input_ratio: 0.125
    max_input_ratio: 100
    num_buckets: 10
  }

  data_augmentation_options {
    resize_image_with_fixed_height {
      new_height: 32
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  num_epochs: 1
}

export_config {
  batch_size: -1
}

model_config {
  model_class: 'TextRecognition'
  text_recognition {
    backbone {
      class_name: 'text_resnet15'
      batchnorm_trainable: true
      weight_decay: 0.00001
    }
    attention_head {
      input_layer: 'text_resnet15/conv5_0'
      crnn_encoder {
        num_layers: 2
        layer_norm_basic_lstm {
          num_units: 512
        }
        encoder_type: UNI
      }
      attention_decoder {
        embedding_size: 256
        num_layers: 2
        layer_norm_basic_lstm {
          num_units: 512
        }
        attention_mechanism: "monotonic_normed_bahdanau"
        # visualize_type: "line"
      }
    }
  }
}
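
Unlike the earlier recognition configs, which rescale pixels to [0, 1], this one subtracts the per-channel VGG/ImageNet means. A one-function sketch of what subtract_channel_mean does to an RGB image:

import numpy as np

def subtract_channel_mean(image, means=(123.68, 116.779, 103.939)):
    """image: array of shape [H, W, 3] in RGB order, values in [0, 255]."""
    return image.astype(np.float32) - np.asarray(means, dtype=np.float32)

print(subtract_channel_mean(np.full((2, 2, 3), 128.0))[0, 0])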

cnn_spatial_attention_r15

##-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2020-04-24
# text cnn spatial attention config for receipt text

train_config: {
  optimizer {
    adam_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.0001
          decay_steps: 30000
          decay_factor: 0.7
        }
      }
    }
    use_moving_average: false
  }
  # gradient_clipping_by_norm : 10.0
  num_steps: 1000000
  model_dir: 'experiments/recipt_text/cnn_spatial_attn_resnet15_fixed_height_wopretrain'
  save_checkpoints_steps: 2000
  save_summary_steps: 100
  log_step_count_steps: 100
  summary_model_vars: false
  # for distributed training only
  # sync_replicas: false
  # replicas_to_aggregate: 8
  # num_worker_replicas: 8
}

train_data: {
  input_path: "data/recipt_text/recognition_tfrecords/train_*.tfrecord"
  batch_size: 24
  shuffle: true
  num_readers: 8
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict"
    min_input_ratio: 0.125
    max_input_ratio: 38
    num_buckets: 10
  }

  data_augmentation_options {
    resize_image_with_fixed_height {
      new_height: 64
    }
  }

  data_augmentation_options {
    random_distort_color {
    }
  }

  data_augmentation_options {
    rgb_to_gray {
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0
      original_maxval: 255
      target_minval: 0
      target_maxval: 1
    }
  }
}

eval_config: {
  num_visualizations : 16
}

eval_data: {
  input_path: "data/recipt_text/recognition_tfrecords/test.tfrecord"
  batch_size: 24
  shuffle: false
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict"
    min_input_ratio: 0.125
    max_input_ratio: 100
    num_buckets: 10
  }

  data_augmentation_options {
    resize_image_with_fixed_height {
      new_height: 64
    }
  }

  data_augmentation_options {
    rgb_to_gray {
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0
      original_maxval: 255
      target_minval: 0
      target_maxval: 1
    }
  }
  num_epochs: 1
}

export_config {
  batch_size: -1
}

model_config {
  model_class: 'TextRecognition'
  text_recognition {
    backbone {
      class_name: 'text_resnet15'
    }
    attention_head {
      input_layer: 'text_resnet15/conv5_0'
      cnn_spatial_encoder {
      }
      attention_decoder {
        embedding_size: 64
        num_layers: 2
        basic_lstm {
          num_units: 512
        }
        attention_mechanism: "normed_bahdanau"
        pass_hidden_state: false
        visualize_type: "spatial"
      }
    }
  }
}
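
normalize_image above is a linear rescaling from [original_minval, original_maxval] to [target_minval, target_maxval]; with the values used here it maps 8-bit pixels into [0, 1]. A sketch of the mapping:

def normalize_image(pixel, original_minval=0.0, original_maxval=255.0,
                    target_minval=0.0, target_maxval=1.0):
    scale = (target_maxval - target_minval) / (original_maxval - original_minval)
    return (pixel - original_minval) * scale + target_minval

print(normalize_image(0), normalize_image(128), normalize_image(255))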

transformer_ocr

#-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-09-19
# transformer_f1024_e12d4.config:
#   Synth90k text recognition model (Transformer) training config

train_config: {
  optimizer {
    adam_optimizer: {
      learning_rate: {
        transformer_learning_rate {
          learning_rate_base: 2
          hidden_size: 512
          warmup_steps: 8000
        }
      }
    }
    use_moving_average: false
  }
  # gradient_clipping_by_norm: 10.0
  num_steps: 1000000
  model_dir: "experiments/synth90k/output/transformer_f1024_e12d4"
  save_checkpoints_steps: 2000
  save_summary_steps: 100
  log_step_count_steps: 100
  summary_model_vars: false
}

train_data: {
  input_path: "data/Synth90k_tfrecords/Synth90k_train_*.tfrecord"
  batch_size: 512
  shuffle: true
  num_readers: 8
  text_recognition_decoder_config {
    char_dict_path: "data/Synth90k_tfrecords/char_dict"
  }

  data_augmentation_options {
    resize_image {
      new_height: 32
      new_width: 100
    }
  }

  data_augmentation_options {
    random_distort_color {
    }
  }

  data_augmentation_options {
    rgb_to_gray {
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0
      original_maxval: 255
      target_minval: -1
      target_maxval: 1
    }
  }
}

eval_config: {
  num_examples: 31232
  num_visualizations : 16
}

eval_data: {
  input_path: "data/Synth90k_tfrecords/Synth90k_test.tfrecord"
  batch_size: 512
  shuffle: false
  text_recognition_decoder_config {
    char_dict_path: "data/Synth90k_tfrecords/char_dict"
  }

  data_augmentation_options {
    resize_image {
      new_height: 32
      new_width: 100
    }
  }

  data_augmentation_options {
    rgb_to_gray {
    }
  }

  data_augmentation_options {
    normalize_image {
      original_minval: 0
      original_maxval: 255
      target_minval: -1
      target_maxval: 1
    }
  }
}

export_config {
  batch_size: -1
}

model_config {
  model_class: 'TextRecognition'
  text_recognition {
    transformer_head {
      input_layer: 'image'
      transformer_encoder {
        num_layers: 12
        hidden_size: 512
        num_heads: 8
        filter_size: 1024
        layer_postprocess_dropout: 0.1
        attention_dropout: 0.1
        relu_dropout: 0.1
        pooling_rate: 4
      }
      transformer_decoder {
        num_layers: 4
        hidden_size: 512
        num_heads: 8
        filter_size: 1024
        layer_postprocess_dropout: 0.1
        attention_dropout: 0.1
        relu_dropout: 0.1
      }
    }
  }
}
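
transformer_learning_rate with learning_rate_base, hidden_size and warmup_steps matches the shape of the warmup-then-inverse-square-root schedule from "Attention Is All You Need". A sketch under that assumption (the exact formula the framework implements may differ):

def transformer_lr(step, learning_rate_base=2.0, hidden_size=512, warmup_steps=8000):
    step = max(step, 1)
    # linear warmup for the first warmup_steps, then decay with 1/sqrt(step)
    return (learning_rate_base * hidden_size ** -0.5 *
            min(step * warmup_steps ** -1.5, step ** -0.5))

for step in (1, 4000, 8000, 100000):
    print(step, transformer_lr(step))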

End-to-End Text Recognition

text_end2end_krcnn_attention

#-*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-03-12
# text_end2end_krcnn_attn_dis.config:
#   receipt text end2end model training config

train_config: {
  optimizer {
    adam_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.0001
          decay_steps: 50000
          decay_factor: 0.5
          min_learning_rate: 0.000001
        }
      }
    }
    use_moving_average: false
  }
  # gradient_clipping_by_norm: 0.0
  fine_tune_checkpoint: "pretrained_models/general_text_end2end_krcnn_attn_resnet50/model.ckpt"
  num_steps: 1000000
  model_dir: "experiments/recipt_text/text_end2end_krcnn_resnet50_attn"
  save_checkpoints_steps: 2000
  save_summary_steps: 100
  log_step_count_steps: 100
  summary_model_vars: false
  # for distributed training only
  # sync_replicas: false
  # replicas_to_aggregate: 8
  # num_worker_replicas: 8
}

train_data: {
  input_path: "data/recipt_text/end2end_tfrecords/train_*.tfrecord"
  batch_size: 1
  shuffle: true
  shuffle_buffer_size: 64
  prefetch_size: 64
  num_readers: 8
  text_end2end_decoder_config {
    char_dict_path: "data/recipt_text/end2end_tfrecords/char_dict"
    label_map_path: "data/recipt_text/end2end_tfrecords/label_map.pbtxt"
  }

  data_augmentation_options {
    random_jitter_aspect_ratio {
      min_jitter_coef: 0.8
      max_jitter_coef: 1.2
    }
  }

  data_augmentation_options {
    random_rotation90 {
    }
  }

  data_augmentation_options {
    random_rotation {
      min_angle: -10
      max_angle: 10
      use_keypoints_calc_boxes: true
    }
  }

  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 640
      max_sizes: 1440
      min_sizes: 800
      max_sizes: 1440
      min_sizes: 960
      max_sizes: 1440
    }
  }
  
  data_augmentation_options {
    random_distort_color {
    }
  }

  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: false
}
    
eval_config: {
  num_examples: 299
  num_visualizations : 16
  visualization_export_dir: ''
  metrics_set: "icdar_end2end_metrics"
}

eval_data: {
  input_path: "data/recipt_text/end2end_tfrecords/test.tfrecord"
  batch_size: 1
  shuffle: false
  prefetch_size: 32
  text_end2end_decoder_config {
    char_dict_path: "data/recipt_text/end2end_tfrecords/char_dict"
    label_map_path: "data/recipt_text/end2end_tfrecords/label_map.pbtxt"
  }
  # note the augmentation order is important, so it cannot be changed
  data_augmentation_options {
    random_resize_to_range {
      min_sizes: 800
      max_sizes: 1440
    }
  }
  data_augmentation_options {
    subtract_channel_mean {
      # see https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/fast_rcnn/config.py#L181
      means: 123.68
      means: 116.779
      means: 103.939
    }
  }
  use_diff: true
}

export_config {
  batch_size: 1
}

model_config: {
  model_class: 'TextEnd2End'
  text_end2end {
    backbone {
        class_name: 'resnet_v1_50'
        batchnorm_trainable: false
        weight_decay: 0.0001
    } 
    fpn {
      input: 'resnet_v1_50/block1'
      input: 'resnet_v1_50/block2'
      input: 'resnet_v1_50/block3'
      input: 'resnet_v1_50/block4'
      fea_dim: 256
      extra_conv_layers: 1
      roi_min_level: 2
      roi_max_level: 5
      roi_canonical_scale: 168
      roi_canonical_level: 4
      conv_hyperparams {
        op: CONV
        regularizer {
          l2_regularizer {
            weight: 0.0001
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.01
          }
        }
      }
    }
    rpn_head {
      # if input_layer is not specified, the FPN features are used;
      # they are all prefixed with "FPN/"
      box_predictor {
        weight_shared_convolutional_box_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              truncated_normal_initializer {
                stddev: 0.01
              }
            }
          }
          depth: 256
          num_layers_before_predictor: 1
          kernel_size: 3
        }
      }

      first_stage_minibatch_size: 256
      first_stage_positive_balance_fraction: 0.5
      first_stage_nms_iou_threshold: 0.7
      first_stage_max_proposals: 300
      rpn_min_size: 8
      first_stage_anchor_generator {
        multiscale_anchor_generator {
          min_level: 2
          max_level: 6
          anchor_scale: 6
          aspect_ratios: [0.2, 0.5, 1, 2, 5]
          normalize_coordinates: false
          scales_per_octave: 1
        }
      }
    }
    rcnn_head {
      initial_crop_size: 14
      maxpool_kernel_size: 2
      maxpool_stride: 2
      num_classes: 1

      second_stage_box_predictor {
        mask_rcnn_box_predictor {
          num_layers_before_predictor: 2
          depth: 1024
          fc_hyperparams {
            op: FC
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              truncated_normal_initializer {
                stddev: 0.01
              }
            }
          }
          agnostic: true
        }
      }

      hard_example_miner {
        num_hard_examples: 128
        iou_threshold: 0.99
        loss_type: BOTH
      }

      nms_config {
        score_threshold: 0.7
        iou_threshold: 0.3
        max_detections_per_class: 400
        max_total_detections: 400
      }

      second_stage_batch_size: 128
      second_stage_balance_fraction: 0.25
    }

    keypoint_head {
      keypoint_predictor {
        text_resnet_keypoint_predictor {
          conv_hyperparams {
            op: CONV
            regularizer {
              l2_regularizer {
                weight: 0.0001
              }
            }
            initializer {
              variance_scaling_initializer {
              }
            }
          }
          fc_hyperparams {
            op: FC
            regularizer {
              l2_regularizer {
                 weight: 0.0001
              }
            }
            initializer {
              variance_scaling_initializer {
              }
            }
          }
        }
      }
      initial_crop_size: 28
      maxpool_kernel_size: 2
      maxpool_stride: 2
      num_keypoints: 4
      predict_direction: true
      direction_trainable: true
      unified_direction: true
    }

    fixed_height_feature_gather {
      input_layer: 'FPN/level_1'
      height: 8
      max_width: 300
      visualize_height: 32
      visualize_width: 100
      num_buckets: 5
    }

    attention_head {
      crnn_encoder {
        cnn_name: 'senet5_encoder'
        norm_type: GROUP
        weight_decay: 0.0
        num_layers: 2
        basic_lstm {
          num_units: 512
        }
        encoder_type: UNI
      }
      attention_decoder {
        embedding_size: 64
        num_layers: 2
        basic_lstm {
          num_units: 512
        }
        attention_mechanism: "normed_bahdanau"
        # visualize_type: "line"
      }
    }
  }
}
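
For the multiscale_anchor_generator in rpn_head above, a common convention (as in the TF Object Detection API this block resembles; an assumption here) is a base anchor side of anchor_scale * 2^level at each pyramid level, stretched by every aspect ratio:

import math

def anchors_at_level(level, anchor_scale=6, aspect_ratios=(0.2, 0.5, 1, 2, 5)):
    base = anchor_scale * 2 ** level                 # e.g. 24 px at level 2, 384 px at level 6
    boxes = []
    for ratio in aspect_ratios:                      # ratio taken as width / height
        w = base * math.sqrt(ratio)
        h = base / math.sqrt(ratio)
        boxes.append((h, w))                         # area stays roughly base**2
    return boxes

print(anchors_at_level(2))   # the smallest anchors, for short text fragments
print(anchors_at_level(6))   # the largest anchors, up to roughly 860 px wide lines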
   

SavedModel Evaluation

detector

# -*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-05-06
# detector evaluation config

predictor_name: "Detector"

model_path: "data/test/inference/rfcn"

eval_data: {
  input_path: "data/voc0712_tfrecord/VOC2007_test.tfrecord"
  batch_size: 1
  shuffle: false
  prefetch_size: 32
  voc_decoder_config {
    label_map_path: "data/voc0712_tfrecord/pascal_label_map.pbtxt"
  }
  # do not need augmentation
  use_diff: true
  num_epochs: 1
}

eval_config: {
  num_examples: 10
  metrics_set: 'coco_detection_metrics'
  metrics_set: 'pascal_voc_detection_metrics'
  metrics_set: 'pascal_voc07_detection_metrics'
}
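
Before pointing one of these evaluation configs at a SavedModel, it can help to inspect what the exported model expects. A minimal TensorFlow 1.x sketch that loads the SavedModel under model_path and prints its signatures (generic TensorFlow usage, not a command of this framework; the signature names actually exported are not shown in this document):

import tensorflow as tf

export_dir = "data/test/inference/rfcn"  # the model_path from the config above

with tf.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    # Each signature lists the input and output tensor names of the exported model.
    for name, sig in meta_graph.signature_def.items():
        print(name, list(sig.inputs.keys()), list(sig.outputs.keys()))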

classifier

# -*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-04-29
# classifier evaluation config

predictor_name: "Classifier"

model_path: "data/test/inference/cifar10_resnet50"

eval_data : {
  input_path: "data/cifar10/cifar10_test.tfrecord"
  batch_size: 100
  shuffle: false
  num_readers: 1
  drop_remainder: false
  classification_decoder_config{
    label_map_path: 'data/cifar10/labelmap.pbtxt'
  }
}

eval_config: {
  num_visualizations : 16
  visualization_export_dir: ''
  metrics_set: "classification_metrics"
}

multilabel_classifier

# -*- encoding:utf-8 -*-
# Author: wenmeng.zwm@alibaba-inc.com
# Date: 2019-09-17
# multi-label classifier evaluation config

predictor_name: "MultiLabelClassifier"

model_path: "data/test/inference/objects365_resnet101"

eval_data : {
  input_path: "data/objects365_tfrecord/objects365_test*.tfrecord"
  batch_size: 16
  shuffle: false
  num_readers: 2
  classification_decoder_config {
    label_map_path: "data/objects365_tfrecord/objects365_label_map.pbtxt"
    is_multi_label: true
  }
}

eval_config: {
  metrics_set: "multi_label_classification_metrics"
  include_metrics_per_category: true
}

text_detector

# -*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-04-29
# text detector evaluation config

predictor_name: "TextDetector"

model_path: "data/test/inference/text_krcnn"

eval_data: {
  input_path: "data/icdar_detection_tfrecords/icdar-ch4-test.tfrecord"
  batch_size: 1
  shuffle: false
  prefetch_size: 32
  text_detection_decoder_config {
    label_map_path: "data/icdar_detection_tfrecords/label_map.pbtxt"
  }
  # do not need augmentation
  use_diff: true
  num_epochs: 1
}

eval_config: {
  metrics_set: "icdar_detection_metrics"
}

text_recognizer

# -*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-04-29
# text recognizer evaluation config

predictor_name: "TextRecognizer"

model_path: "data/test/inference/crnn_attn"

eval_data: {
  input_path: "data/recipt_text/recognition_tfrecords/test.tfrecord"
  batch_size: 64
  shuffle: false
  text_recognition_decoder_config {
    char_dict_path: "data/recipt_text/recognition_tfrecords/char_dict"
    min_input_ratio: 0.125
    max_input_ratio: 100
  }
  # do not need augmentation
  num_epochs: 1
}

eval_config: {
  metrics_set: "text_recognition_metrics"
}

text_spotter

# -*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-04-29
# text spotter evaluation config

predictor_name: "TextSpotter"

model_path: "data/test/inference/text_end2end"

eval_data: {
  input_path: "data/recipt_text/end2end_tfrecords/test.tfrecord"
  batch_size: 1
  shuffle: false
  prefetch_size: 32
  text_end2end_decoder_config {
    char_dict_path: "data/recipt_text/end2end_tfrecords/char_dict"
    label_map_path: "data/recipt_text/end2end_tfrecords/label_map.pbtxt"
  }
  # do not need augmentation
  use_diff: true
  num_epochs: 1
}

eval_config: {
  num_visualizations : 16
  visualization_export_dir: ''
  metrics_set: "icdar_end2end_metrics"
}

text_pipeline_predictor

# -*- encoding:utf-8 -*-
# Author: hongsheng.jhs@alibaba-inc.com
# Date: 2019-04-29
# text pipeline predictor evaluation config

predictor_name: "TextPipelinePredictor"

model_path: "data/test/inference/text_pipeline"

eval_data: {
  input_path: "data/recipt_text/end2end_tfrecords/test.tfrecord"
  batch_size: 1
  shuffle: false
  prefetch_size: 32
  text_end2end_decoder_config {
    char_dict_path: "data/recipt_text/end2end_tfrecords/char_dict"
    label_map_path: "data/recipt_text/end2end_tfrecords/label_map.pbtxt"
  }
  # do not need augmentation
  use_diff: true
  num_epochs: 1
}

eval_config: {
  metrics_set: "icdar_end2end_metrics"
}