SharePoint 2013 Content Enrichment: Regular Expression Data Extraction

There are a number of posts on getting started with writing a Content Enrichment Web Service (CEWS). I found https://msdn.microsoft.com/en-us/library/office/jj163982.aspx a great starting point. I recently needed to write a CEWS application to extract text matching a regular expression pattern from a managed property, and I thought I would share it as a template.

 

Set InProperty to the name of the managed property you would like to extract from, and OutProperty to the name of the managed property that will hold the extracted data.

 

using System;

using System.Collections.Generic;

using System.IO;

using Microsoft.Office.Server.Search.ContentProcessingEnrichment;

using Microsoft.Office.Server.Search.ContentProcessingEnrichment.PropertyTypes;

using System.Text.RegularExpressions;

 

namespace RegExContentProcessingEnrichmentService

{

 

/// <summary>
/// Content enrichment web service that reads the text of one managed property,
/// collects every regular-expression match found in it, and returns the captured
/// values in a second managed property.
/// </summary>
public class RegExContentProcessingEnrichmentService : IContentProcessingEnrichmentService
{
    // Error code reported when the input managed property holds a value that is not a string.
    private const int UnexpectedType = 1;

    // Error code reported when an unexpected exception is encountered.
    private const int UnexpectedError = 2;

    // Name of the managed property the extracted values are written to.
    private const string OutProperty = "out";

    // Name of the managed property whose text the pattern is matched against.
    private const string InProperty = "in";

    // Pattern to find in the input; capture group 1 (the text between the quotes)
    // is what gets extracted. Built once instead of on every ProcessItem call.
    // RegexOptions.Singleline only changes how '.' matches, so it has no effect on
    // this particular pattern; it is kept so that a replacement pattern that uses
    // '.' still matches across line breaks.
    private static readonly Regex ValuePattern =
        new Regex("Value=\"([^\"]*)\"", RegexOptions.Singleline);

    // Result object that is reset and reused on every call to ProcessItem.
    private readonly ProcessedItem processedItemHolder = new ProcessedItem
    {
        ItemProperties = new List<AbstractProperty>()
    };

    /// <summary>
    /// Extracts every value matched by the pattern from the input managed property
    /// and returns them in the output managed property.
    /// </summary>
    /// <param name="item">The item being processed, carrying its managed properties.</param>
    /// <returns>
    /// The processed item. ErrorCode is 0 on success, <c>UnexpectedType</c> when the
    /// input property holds a non-string value, and <c>UnexpectedError</c> when any
    /// exception is thrown. The output property is added only when the input property
    /// was present with a non-null string value; its list is empty when nothing matched.
    /// </returns>
    public ProcessedItem ProcessItem(Item item)
    {
        processedItemHolder.ErrorCode = 0;
        processedItemHolder.ItemProperties.Clear();

        try
        {
            // Text of the input property; stays null when the property is absent or empty.
            string input = null;

            // Loop through each property looking for the input property.
            // https://blogs.msdn.com/b/richard_dizeregas_blog/archive/2013/06/19/advanced-content-enrichment-in-sharepoint-2013-search.aspx
            // presents a more elegant method of selecting the desired properties.
            foreach (var property in item.ItemProperties)
            {
                if (!property.Name.Equals(InProperty, StringComparison.Ordinal))
                {
                    continue;
                }

                // Read through ObjectValue rather than casting to Property<string>,
                // and report a wrong value type explicitly instead of letting an
                // InvalidCastException surface as a generic unexpected error.
                object rawValue = property.ObjectValue;
                input = rawValue as string;
                if (input == null && rawValue != null)
                {
                    processedItemHolder.ErrorCode = UnexpectedType;
                    return processedItemHolder;
                }
            }

            if (input != null)
            {
                // The output is a list of strings; per the original author's note the
                // output managed property is configured as multi-valued.
                Property<List<String>> output = new Property<List<String>>();
                output.Name = OutProperty;
                output.Value = new List<String>();

                foreach (Match match in ValuePattern.Matches(input))
                {
                    // Add the value of the matching group to the output managed property.
                    output.Value.Add(match.Groups[1].Value);
                }

                processedItemHolder.ItemProperties.Add(output);
            }
        }
        catch (Exception)
        {
            // Signal the failure through the error code rather than letting the
            // exception escape to the caller.
            processedItemHolder.ErrorCode = UnexpectedError;
        }

        return processedItemHolder;
    }
}

}