Browse Source

pattern表达式调整
pk缓存

mcy 6 years ago
parent
commit
706ee88914

+ 8 - 0
protocol/pom.xml

@@ -26,5 +26,13 @@
 			<groupId>commons-lang</groupId>
 			<artifactId>commons-lang</artifactId>
 		</dependency>
+		<dependency>
+			<groupId>oro</groupId>
+			<artifactId>oro</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>com.googlecode.aviator</groupId>
+			<artifactId>aviator</artifactId>
+		</dependency>
 	</dependencies>
 </project>

+ 47 - 31
protocol/src/main/java/com/alibaba/otter/canal/protocol/FlatMessage.java

@@ -2,8 +2,10 @@ package com.alibaba.otter.canal.protocol;
 
 import java.io.Serializable;
 import java.util.*;
-import java.util.regex.Pattern;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
 
+import com.alibaba.otter.canal.protocol.aviater.AviaterRegexFilter;
 import org.apache.commons.lang.StringUtils;
 
 import com.google.protobuf.ByteString;
@@ -14,24 +16,27 @@ import com.google.protobuf.ByteString;
  */
 public class FlatMessage implements Serializable {
 
-    private static final long          serialVersionUID = -3386650678735860050L;
+    private static final long                                    serialVersionUID = -3386650678735860050L;
 
-    private long                       id;
-    private String                     database;
-    private String                     table;
-    private Boolean                    isDdl;
-    private String                     type;
+    private static ConcurrentMap<String, String>                 schemaTabPk      = new ConcurrentHashMap<>();
+    private static ConcurrentHashMap<String, AviaterRegexFilter> regexFilters     = new ConcurrentHashMap<>();
+
+    private long                                                 id;
+    private String                                               database;
+    private String                                               table;
+    private Boolean                                              isDdl;
+    private String                                               type;
     // binlog executeTime
-    private Long                       es;
+    private Long                                                 es;
     // dml build timeStamp
-    private Long                       ts;
-    private String                     sql;
-    private Map<String, Integer>       sqlType;
-    private Map<String, String>        mysqlType;
-    private List<Map<String, String>>  data;
-    private List<Map<String, String>>  old;
+    private Long                                                 ts;
+    private String                                               sql;
+    private Map<String, Integer>                                 sqlType;
+    private Map<String, String>                                  mysqlType;
+    private List<Map<String, String>>                            data;
+    private List<Map<String, String>>                            old;
 
-    private transient CanalEntry.Entry entry;                                   // 所属entry
+    private transient CanalEntry.Entry                           entry;                                       // 所属entry
 
     public FlatMessage(long id){
         this.id = id;
@@ -143,7 +148,7 @@ public class FlatMessage implements Serializable {
 
     /**
      * 将Message转换为FlatMessage
-     * 
+     *
      * @param message 原生message
      * @return FlatMessage列表
      */
@@ -272,7 +277,7 @@ public class FlatMessage implements Serializable {
 
     /**
      * 将FlatMessage按指定的字段值hash拆分
-     * 
+     *
      * @param flatMessage flatMessage
      * @param partitionsNum 分区数量
      * @param pkHashConfigs hash映射
@@ -302,7 +307,14 @@ public class FlatMessage implements Serializable {
                         pk = pkHashConfig.substring(i + 1);
                     }
                     pkHashConfig = pkHashConfig.substring(0, i);
-                    isMatch = Pattern.matches(pkHashConfig, database + "." + table);
+
+                    AviaterRegexFilter aviaterRegexFilter = regexFilters.get(pkHashConfig);
+                    if (aviaterRegexFilter == null) {
+                        aviaterRegexFilter = new AviaterRegexFilter(pkHashConfig);
+                        regexFilters.putIfAbsent(pkHashConfig, aviaterRegexFilter);
+                    }
+
+                    isMatch = aviaterRegexFilter.filter(database + "." + table);
                     if (isMatch) {
                         break;
                     }
@@ -313,19 +325,23 @@ public class FlatMessage implements Serializable {
                     partitionMessages[0] = flatMessage;
                 } else {
                     if (pk == null) {
-                        // 如果未指定主键(通配符主键),从原生message中取主键字段
-                        CanalEntry.Entry entry = flatMessage.getEntry();
-                        CanalEntry.RowChange rowChange;
-                        try {
-                            rowChange = CanalEntry.RowChange.parseFrom(entry.getStoreValue());
-                        } catch (Exception e) {
-                            throw new RuntimeException(e.getMessage(), e);
-                        }
-                        CanalEntry.RowData rowData = rowChange.getRowDatasList().get(0);
-                        for (CanalEntry.Column column : rowData.getAfterColumnsList()) {
-                            if (column.getIsKey()) {
-                                pk = column.getName();
-                                break;
+                        pk = schemaTabPk.get(database + "." + table);
+                        if (pk == null) {
+                            // 如果未指定主键(通配符主键),从原生message中取主键字段
+                            CanalEntry.Entry entry = flatMessage.getEntry();
+                            CanalEntry.RowChange rowChange;
+                            try {
+                                rowChange = CanalEntry.RowChange.parseFrom(entry.getStoreValue());
+                            } catch (Exception e) {
+                                throw new RuntimeException(e.getMessage(), e);
+                            }
+                            CanalEntry.RowData rowData = rowChange.getRowDatasList().get(0);
+                            for (CanalEntry.Column column : rowData.getAfterColumnsList()) {
+                                if (column.getIsKey()) {
+                                    pk = column.getName();
+                                    schemaTabPk.putIfAbsent(database + "." + table, pk);
+                                    break;
+                                }
                             }
                         }
                     }

+ 35 - 19
protocol/src/main/java/com/alibaba/otter/canal/protocol/Message.java

@@ -3,15 +3,17 @@ package com.alibaba.otter.canal.protocol;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.regex.Pattern;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
 
-import com.google.protobuf.InvalidProtocolBufferException;
+import com.alibaba.otter.canal.protocol.aviater.AviaterRegexFilter;
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.lang.builder.ToStringBuilder;
 
 import com.alibaba.otter.canal.common.utils.CanalToStringStyle;
 import com.alibaba.otter.canal.protocol.CanalEntry.Entry;
 import com.google.protobuf.ByteString;
+import com.google.protobuf.InvalidProtocolBufferException;
 
 /**
  * @author zebin.xuzb @ 2012-6-19
@@ -19,14 +21,17 @@ import com.google.protobuf.ByteString;
  */
 public class Message implements Serializable {
 
-    private static final long      serialVersionUID = 1234034768477580009L;
+    private static final long                                    serialVersionUID = 1234034768477580009L;
+
+    private static ConcurrentMap<String, String>                 schemaTabPk      = new ConcurrentHashMap<>();
+    private static ConcurrentHashMap<String, AviaterRegexFilter> regexFilters     = new ConcurrentHashMap<>();
 
-    private long                   id;
-    private List<CanalEntry.Entry> entries          = new ArrayList<CanalEntry.Entry>();
+    private long                                                 id;
+    private List<CanalEntry.Entry>                               entries          = new ArrayList<CanalEntry.Entry>();
     // row data for performance, see:
     // https://github.com/alibaba/canal/issues/726
-    private boolean                raw              = true;
-    private List<ByteString>       rawEntries       = new ArrayList<ByteString>();
+    private boolean                                              raw              = true;
+    private List<ByteString>                                     rawEntries       = new ArrayList<ByteString>();
 
     public Message(long id, List<Entry> entries){
         this.id = id;
@@ -152,7 +157,14 @@ public class Message implements Serializable {
                             pk = pkHashConfig.substring(i + 1);
                         }
                         pkHashConfig = pkHashConfig.substring(0, i);
-                        isMatch = Pattern.matches(pkHashConfig, database + "." + table);
+
+                        AviaterRegexFilter aviaterRegexFilter = regexFilters.get(pkHashConfig);
+                        if (aviaterRegexFilter == null) {
+                            aviaterRegexFilter = new AviaterRegexFilter(pkHashConfig);
+                            regexFilters.putIfAbsent(pkHashConfig, aviaterRegexFilter);
+                        }
+
+                        isMatch = aviaterRegexFilter.filter(database + "." + table);
                         if (isMatch) {
                             break;
                         }
@@ -163,17 +175,21 @@ public class Message implements Serializable {
                         partitionEntries[0].add(entry);
                     } else {
                         if (pk == null) {
-                            // 如果未指定主键(通配符主键),取主键字段
-                            try {
-                                rowChange = CanalEntry.RowChange.parseFrom(entry.getStoreValue());
-                            } catch (Exception e) {
-                                throw new RuntimeException(e.getMessage(), e);
-                            }
-                            CanalEntry.RowData rowData = rowChange.getRowDatasList().get(0);
-                            for (CanalEntry.Column column : rowData.getAfterColumnsList()) {
-                                if (column.getIsKey()) {
-                                    pk = column.getName();
-                                    break;
+                            pk = schemaTabPk.get(database + "." + table);
+                            if (pk == null) {
+                                // 如果未指定主键(通配符主键),取主键字段
+                                try {
+                                    rowChange = CanalEntry.RowChange.parseFrom(entry.getStoreValue());
+                                } catch (Exception e) {
+                                    throw new RuntimeException(e.getMessage(), e);
+                                }
+                                CanalEntry.RowData rowData = rowChange.getRowDatasList().get(0);
+                                for (CanalEntry.Column column : rowData.getAfterColumnsList()) {
+                                    if (column.getIsKey()) {
+                                        pk = column.getName();
+                                        schemaTabPk.putIfAbsent(database + "." + table, pk);
+                                        break;
+                                    }
                                 }
                             }
                         }

+ 124 - 0
protocol/src/main/java/com/alibaba/otter/canal/protocol/aviater/AviaterRegexFilter.java

@@ -0,0 +1,124 @@
+package com.alibaba.otter.canal.protocol.aviater;
+
+import java.util.*;
+
+import org.apache.commons.lang.StringUtils;
+
+import com.googlecode.aviator.AviatorEvaluator;
+import com.googlecode.aviator.Expression;
+
+/**
+ * Filter that matches a table name against regular expressions, built on aviater
+ *
+ * @author jianghang 2012-7-20 06:01:34 PM
+ */
+public class AviaterRegexFilter {
+
+    private static final String             SPLIT             = ","; // separator between sub-patterns in the incoming pattern string
+    private static final String             PATTERN_SPLIT     = "|"; // separator used to join the anchored sub-patterns into one pattern
+    private static final String             FILTER_EXPRESSION = "regex(pattern,target)"; // expression evaluated by filter()
+    private static final RegexFunction regexFunction     = new RegexFunction();
+    private final Expression                exp               = AviatorEvaluator.compile(FILTER_EXPRESSION, true); // compiled per instance; NOTE(review): second argument presumably enables aviator's expression cache - confirm
+    static {
+        AviatorEvaluator.addFunction(regexFunction); // registers the function used by FILTER_EXPRESSION
+    }
+
+    private static final Comparator<String> COMPARATOR        = new StringComparator(); // orders sub-patterns longest first
+
+    final private String                    pattern; // anchored sub-patterns joined with PATTERN_SPLIT
+    final private boolean                   defaultEmptyValue; // result of filter() when the pattern or the input is empty
+
+    public AviaterRegexFilter(String pattern){ // same as the two-argument constructor with defaultEmptyValue = true
+        this(pattern, true);
+    }
+
+    public AviaterRegexFilter(String pattern, boolean defaultEmptyValue){
+        this.defaultEmptyValue = defaultEmptyValue;
+        List<String> list = null;
+        if (StringUtils.isEmpty(pattern)) {
+            list = new ArrayList<String>();
+        } else {
+            String[] ss = StringUtils.split(pattern, SPLIT);
+            list = Arrays.asList(ss);
+        }
+
+        // Sort the sub-patterns from longest to shortest,
+        // because matching foot against foo|foot fails: once foot matches foo, foo is returned, but
+        // the length of foo differs from the length of foot
+        Collections.sort(list, COMPARATOR);
+        // Anchor every sub-pattern so that it must match from start to end
+        list = completionPattern(list);
+        this.pattern = StringUtils.join(list, PATTERN_SPLIT);
+    }
+
+    public boolean filter(String filtered)  { // evaluates the expression against filtered; returns defaultEmptyValue when the pattern or filtered is empty
+        if (StringUtils.isEmpty(pattern)) {
+            return defaultEmptyValue;
+        }
+
+        if (StringUtils.isEmpty(filtered)) {
+            return defaultEmptyValue;
+        }
+
+        Map<String, Object> env = new HashMap<String, Object>();
+        env.put("pattern", pattern);
+        env.put("target", filtered.toLowerCase()); // NOTE(review): toLowerCase() without an explicit Locale uses the default locale
+        return (Boolean) exp.execute(env);
+    }
+
+    /**
+     * Works around a regex matching problem; because oro's matches is used, the following can happen:
+     *
+     * <pre>
+     * matching foot against foo|foot fails, because once foot matches foo, foo is returned, but the length of foo differs from the length of foot
+     * </pre>
+     *
+     * This class therefore sorts the regular expressions from longest to shortest
+     *
+     * @author zebin.xuzb 2012-10-22 2:02:26 PM
+     * @version 1.0.0
+     */
+    private static class StringComparator implements Comparator<String> {
+
+        @Override
+        public int compare(String str1, String str2) {
+            if (str1.length() > str2.length()) {
+                return -1; // the longer string sorts first
+            } else if (str1.length() < str2.length()) {
+                return 1;
+            } else {
+                return 0;
+            }
+        }
+    }
+
+    /**
+     * Works around a regex matching problem; even with the patterns sorted by decreasing length, the following can still happen:
+     *
+     * <pre>
+     * matching fooooot against foooo|f.*t fails, because once fooooot matches foooo, fooo is compared with the data, but the length of foooo differs from the length of fooooot
+     * </pre>
+     *
+     * Each pattern is therefore wrapped so that it must match from start to end
+     *
+     * @author simon
+     * @version 1.0.0
+     */
+
+    private List<String> completionPattern(List<String> patterns) { // returns a new list in which every pattern is wrapped as ^pattern$
+        List<String> result = new ArrayList<String>();
+        for (String pattern : patterns) {
+            StringBuffer stringBuffer = new StringBuffer();
+            stringBuffer.append("^");
+            stringBuffer.append(pattern);
+            stringBuffer.append("$");
+            result.add(stringBuffer.toString());
+        }
+        return result;
+    }
+
+    @Override
+    public String toString() { // the joined, anchored pattern string
+        return pattern;
+    }
+}

+ 38 - 0
protocol/src/main/java/com/alibaba/otter/canal/protocol/aviater/PatternUtils.java

@@ -0,0 +1,38 @@
+package com.alibaba.otter.canal.protocol.aviater;
+
+import java.util.Map;
+
+import org.apache.oro.text.regex.MalformedPatternException;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.Perl5Compiler;
+
+import com.google.common.base.Function;
+import com.google.common.collect.MapMaker;
+import com.google.common.collect.MigrateMap;
+
+public class PatternUtils { // static holder for compiled ORO patterns, keyed by the pattern string
+    @SuppressWarnings("deprecation")
+    private static Map<String, Pattern> patterns = MigrateMap.makeComputingMap(new MapMaker().softValues(), // NOTE(review): presumably computes a missing entry on lookup via the function below - confirm against MigrateMap
+        new Function<String, Pattern>() {
+
+            public Pattern apply(String pattern) { // compiles the pattern string with Perl5Compiler
+                try {
+                    PatternCompiler pc = new Perl5Compiler();
+                    return pc.compile(pattern,
+                        Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+                                               | Perl5Compiler.SINGLELINE_MASK);
+                } catch (MalformedPatternException e) {
+                    throw new RuntimeException(e); // a malformed pattern surfaces as an unchecked exception, with the cause kept
+                }
+            }
+        });
+
+    public static Pattern getPattern(String pattern) { // looks the pattern string up in the patterns map
+        return patterns.get(pattern);
+    }
+
+    public static void clear() { // removes every entry from the patterns map
+        patterns.clear();
+    }
+}

+ 31 - 0
protocol/src/main/java/com/alibaba/otter/canal/protocol/aviater/RegexFunction.java

@@ -0,0 +1,31 @@
+package com.alibaba.otter.canal.protocol.aviater;
+
+import java.util.Map;
+
+import org.apache.oro.text.regex.Perl5Matcher;
+
+import com.googlecode.aviator.runtime.function.AbstractFunction;
+import com.googlecode.aviator.runtime.function.FunctionUtils;
+import com.googlecode.aviator.runtime.type.AviatorBoolean;
+import com.googlecode.aviator.runtime.type.AviatorObject;
+
+/**
+ * Code extension that provides a regex function for aviator
+ *
+ * @author jianghang 2012-7-23 10:29:23 AM
+ */
+public class RegexFunction extends AbstractFunction {
+
+    public AviatorObject call(Map<String, Object> env, AviatorObject arg1, AviatorObject arg2) { // arg1 is the pattern string, arg2 the text to test
+        String pattern = FunctionUtils.getStringValue(arg1, env);
+        String text = FunctionUtils.getStringValue(arg2, env);
+        Perl5Matcher matcher = new Perl5Matcher(); // a new matcher is created on every call
+        boolean isMatch = matcher.matches(text, PatternUtils.getPattern(pattern)); // the Pattern object is obtained from PatternUtils
+        return AviatorBoolean.valueOf(isMatch);
+    }
+
+    public String getName() { // the function name, "regex"
+        return "regex";
+    }
+
+}