//import com.aliasi.sentences.MedlineSentenceModel; //import com.aliasi.sentences.SentenceChunker; //import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; import com.semgine.common.logging.IdHolderLoggerDelegate; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.text.BreakIterator; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.EnumMap; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.ListIterator; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.Map.Entry; import java.util.logging.Logger; /** * This is a simple overview and not the correct MapReduce program. 
*/ public class PrimerMapper extends MapReduceBase implements Mapper { static final char[] END_PUNCTUATION = new char[] {'.', ',', '!', '?'}; static final char[] START_PUNCTUATION = new char[] {'\u00a1', '\u00bf'}; static final char[] OPEN_CHARS = new char[] {'"', '\'', '\u00ab', '\u201c', '\u2018', '(', '[', '<', '{'}; static final char[] CLOSE_CHARS = new char[] {'"', '\'', '\u00bb', '\u201d', '\u2019', ')', ']', '>', '}'}; public static final String DELIMITER = "\t"; public enum Parts { ID, YEAR, CONTENT, CREATOR} public enum Index { MAP, NOT_THREE_PARTS, UNKNOWN_YEAR, NOT_ENGLISH } public PrimerMapper() { ; } final Map toSentences(final String text) { final Map sentenceMap = new HashMap(); try { final MessageDigest md = MessageDigest.getInstance("MD5"); // final MessageDigest md = MessageDigest.getInstance("SHA-256"); // final MessageDigest md = MessageDigest.getInstance("SHA-512"); final Set sentences = breakIntoSentences(text); for (final String sentence : sentences) { final String normalized = normalizeSentence(sentence); final String lowerCased = normalized.toLowerCase(Locale.ENGLISH); try { final byte[] bytes = lowerCased.getBytes("UTF-8"); final byte[] key = md.digest(bytes); sentenceMap.put(key, lowerCased); } catch (final UnsupportedEncodingException e) { e.printStackTrace(); } } } catch (final NoSuchAlgorithmException e) { e.printStackTrace(); } return sentenceMap; } // private final SentenceChunker chunker = new SentenceChunker(new IndoEuropeanTokenizerFactory(), // new MedlineSentenceModel()); final Set breakIntoSentences(final String text) { assert text != null; final Set sentences = new HashSet(); // final Set chunks = this.chunker.chunk(text).chunkSet(); // for (final Chunk chunk : chunks) { // final int start = chunk.start(); // final int end = chunk.end(); // final String cs = text.substring(start, end); // sentences.add(cs); // } final BreakIterator boundary = BreakIterator.getSentenceInstance(Locale.ENGLISH); boundary.setText(text); int start 
= boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { final String sentence = text.substring(start, end).trim(); sentences.add(sentence); } return sentences; } final String normalizeSentence(final String sentence) { assert sentence != null; final List words = new ArrayList(); for (final StringTokenizer st = new StringTokenizer(sentence); st.hasMoreElements(); ) { final String s = st.nextToken(); words.add(s); } correctSentenceEnd(words); final int termCount = words.size(); final String[] terms = words.toArray(new String[termCount]); if (terms.length != 0) { terms[0] = correctFirst(terms[0]); } for (int i = 0; i < terms.length; i++) { terms[i] = correctSuroundingChars(terms[i]); } final StringBuilder sb = new StringBuilder(); for (final String term : terms) { sb.append(term); sb.append(" "); } sb.append("."); final String retval = sb.toString(); return retval; } final String correctSuroundingChars(final String term) { if (term == null) { throw new IllegalArgumentException("term is null"); } if (term.length() == 0) { return ""; } final char[] chars = term.toCharArray(); int start = 0; int length = chars.length; for (int i = 0; i < (chars.length - 1); i++) { boolean found = false; for (int j = 0; j < OPEN_CHARS.length; j++) { if (chars[i] == OPEN_CHARS[j]) { if (chars[chars.length - 1] == CLOSE_CHARS[j]) { if ((start + 1) >= length || (length - 2) <= 0) { return ""; } final String s = new String(chars, 1, length - 2); return correctSuroundingChars(s); } final int open = countCharInTerm(chars, OPEN_CHARS[j]); final char[] second = new char[chars.length - 1]; System.arraycopy(chars, 1, second, 0, chars.length - 1); final int close = countCharInTerm(second, CLOSE_CHARS[j]); if (open != close) { final String s = new String(chars, 1, length - 1); return correctSuroundingChars(s); } } } } for (int i = chars.length - 1 ; i >= 0; i--) { boolean found = false; for (int j = 0; j < CLOSE_CHARS.length; j++) { if 
(chars[i] == CLOSE_CHARS[j]) { final String s = new String(chars, 0, chars.length - 1); return correctSuroundingChars(s); } } if (!found) { break; } } return term; } final String correctLast(final String first) { if (first == null) { throw new IllegalArgumentException("first is null"); } final char[] chars = first.toCharArray(); int length = chars.length; for (int i = chars.length -1 ; i >= 0; i--) { boolean found = false; for (int j = 0; j < END_PUNCTUATION.length; j++) { if (chars[i] == END_PUNCTUATION[j]) { length--; found = true; break; } } if (!found) { break; } } return new String(chars, 0, length); } final String correctFirst(final String first) { if (first == null) { throw new IllegalArgumentException("first is null"); } final char[] chars = first.toCharArray(); int start = 0; int length = chars.length; for (int i = 0; i < chars.length; i++) { boolean found = false; for (int j = 0; j < START_PUNCTUATION.length; j++) { if (chars[i] == START_PUNCTUATION[j]) { start++; length--; found = true; break; } } if (!found) { break; } } return new String(chars, start, length); } final void correctSentenceEnd(final List words) { Collections.reverse(words); for (final ListIterator itr = words.listIterator() ; itr.hasNext() ;) { final String word = itr.next(); final String corrected = correctLast(word); if (corrected.length() == 0) { itr.remove(); } else { itr.set(corrected); break; } } Collections.reverse(words); } final int countCharInTerm(final char[] chars, final char c) { if (chars == null) { throw new IllegalArgumentException("chars is null"); } int count = 0; for (int i = 0; i < chars.length; i++) { if (chars[i] == c) { count++; } } return count; } }