Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…f-0b23-4da4-9fe8-d0f6fe080806
  • Loading branch information
macchiati committed Oct 11, 2018
1 parent 0a03c64 commit a3a7054
Show file tree
Hide file tree
Showing 11 changed files with 462 additions and 139 deletions.
59 changes: 46 additions & 13 deletions unicodetools/org/unicode/tools/emoji/CandidateData.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.Transform;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.text.UnicodeSetSpanner;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;
Expand Down Expand Up @@ -150,7 +152,7 @@ private CandidateData(String sourceFile) {
if (line.startsWith("#") || line.isEmpty()) { // comment
continue;
} else if (line.startsWith("U+")) { // data
fixGenderSkin(source); // old source
fixGenderSkin(source); // fix old source. we do it here so we know the properties

source = Utility.fromHex(line);
if (allCharacters.contains(source)) {
Expand Down Expand Up @@ -318,6 +320,7 @@ private CandidateData(String sourceFile) {
textPresentation.freeze();

emoji_Modifier_Base.freeze();

emoji_Gender_Base.freeze();
takesSign.freeze();
emoji_Component.freeze();
Expand Down Expand Up @@ -392,34 +395,52 @@ private void fixGenderSkin(String source) {
if (source == null) {
return;
}
int single = UnicodeSet.getSingleCodePoint(source);
if (single == Integer.MAX_VALUE) {
return;
if (source.equals("👩‍🦯️")) {
int debug = 0;
}
boolean isModBase = emoji_Modifier_Base.contains(source);
if (isModBase) {


boolean hasModifierBase = emoji_Modifier_Base.containsSome(source)
|| EmojiData.EMOJI_DATA_BETA.getModifierBases().containsSome(source);
if (hasModifierBase) {
// find the point where it occurs; not efficient but we don't care
UnicodeSet all_Emoji_Modifier_Base = new UnicodeSet(emoji_Modifier_Base)
.addAll(EmojiData.EMOJI_DATA_BETA.getModifierBases())
.freeze();

int start = all_Emoji_Modifier_Base.span(source, SpanCondition.NOT_CONTAINED);
int end = all_Emoji_Modifier_Base.span(source, start, SpanCondition.CONTAINED);

String prefix = source.substring(0, end);
String postfix = source.substring(end);
for (String mod : EmojiData.MODIFIERS) {
addCombo(source, source + mod, "", ": " + EmojiData.EMOJI_DATA.getName(mod));
addCombo(source, prefix + mod + postfix, "", ": " + EmojiData.EMOJI_DATA_BETA.getName(mod));
}
}

int single = UnicodeSet.getSingleCodePoint(source);
if (single == Integer.MAX_VALUE) {
return;
}

boolean isGenderBase = emoji_Gender_Base.contains(source);
if (isGenderBase) {
for (String gen : Emoji.GENDER_MARKERS) {
String genSuffix = Emoji.JOINER_STR + gen + Emoji.EMOJI_VARIANT_STRING;
String genPrefix = gen.equals(Emoji.MALE) ? "man " : "woman ";
addCombo(source, source + genSuffix, genPrefix, "");
if (isModBase) {
if (hasModifierBase) {
for (String mod : EmojiData.MODIFIERS) {
addCombo(source, source + mod + genSuffix, genPrefix, ": " + EmojiData.EMOJI_DATA.getName(mod));
addCombo(source, source + mod + genSuffix, genPrefix, ": " + EmojiData.EMOJI_DATA_BETA.getName(mod));
}
}
}
}
if (isGenderBase && isModBase) {
if (isGenderBase && hasModifierBase) {
addComment(source, "Combinations of gender and skin-tone produce 17 more emoji sequences.");
} else if (isGenderBase) {
addComment(source, "Combinations of gender and skin-tone produce 2 more emoji sequences.");
} else if (isModBase) {
} else if (hasModifierBase) {
addComment(source, "Combinations of gender and skin-tone produce 5 more emoji sequences.");
}
// Comment=There will be 55 emoji sequences with combinations of gender and skin-tone
Expand Down Expand Up @@ -470,7 +491,7 @@ public int compare(String o1, String o2) {

String cat1 = getCategory(o1);
int catOrder1 = EmojiOrder.STD_ORDER.getGroupOrder(cat1);

String cat2 = getCategory(o2);
int catOrder2 = EmojiOrder.STD_ORDER.getGroupOrder(cat2);
if (catOrder1 != catOrder2) {
Expand Down Expand Up @@ -785,7 +806,7 @@ public String transform(String source) {
break main;
}
if (source.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ)) {
temp = EmojiData.EMOJI_DATA.getFallbackName(source);
temp = EmojiData.EMOJI_DATA_BETA.getFallbackName(source);
break main;
}
switch(CountEmoji.Category.getBucket(source)) {
Expand Down Expand Up @@ -965,4 +986,16 @@ public String addEmojiVariants(String s1) {
public String getVersionString() {
    // Label is the literal "candidates:" followed by `date` rendered with the
    // yyyyMMdd skeleton in the root locale.
    final DateFormat dayStamp = DateFormat.getInstanceForSkeleton("yyyyMMdd", ULocale.ROOT);
    final String stamped = dayStamp.format(date);
    return "candidates:" + stamped;
}

/**
 * Returns the explicit-gender set for this data source, which is always
 * {@link UnicodeSet#EMPTY}: we don't expect to have any more of these.
 *
 * @return {@code UnicodeSet.EMPTY}
 */
@Override
public UnicodeSet getExplicitGender() {
return UnicodeSet.EMPTY;
}

/**
 * Returns the multi-person-groupings set for this data source, which is always
 * {@link UnicodeSet#EMPTY}: we don't expect to have any more of these.
 *
 * @return {@code UnicodeSet.EMPTY}
 */
@Override
public UnicodeSet getMultiPersonGroupings() {
return UnicodeSet.EMPTY;
}
}
6 changes: 5 additions & 1 deletion unicodetools/org/unicode/tools/emoji/CountEmoji.java
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ enum Attribute {
singleton, zwj, skin, gender, role, family, hair, dup
}

enum Category {
public enum Category {
character("char"),
keycap_seq,
flag_seq,
Expand Down Expand Up @@ -393,6 +393,10 @@ public String toString() {
public String toStringPlain() {
return displayName;
}
/**
 * Alias for {@link #getBucket(String)}, added to make migration easier.
 *
 * @param s the string to classify; passed unchanged to {@code getBucket}
 * @return whatever {@code getBucket(s)} returns
 */
static public Category getType(String s) {
return getBucket(s);
}
static public Category getBucket(String s) {
try {
String noVariants = EmojiData.removeEmojiVariants(s);
Expand Down
23 changes: 13 additions & 10 deletions unicodetools/org/unicode/tools/emoji/EmojiData.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import javax.xml.stream.events.Characters;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.tool.GenerateBirth;
import org.unicode.cldr.util.Annotations;
import org.unicode.cldr.util.Annotations.AnnotationSet;
import org.unicode.cldr.util.CldrUtility;
Expand All @@ -43,10 +40,7 @@
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSet.Builder;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
import com.ibm.icu.dev.util.UnicodeMap;
import com.ibm.icu.lang.CharSequences;
Expand All @@ -55,9 +49,6 @@
import com.ibm.icu.text.Transform;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.text.UnicodeSetSpanner;
import com.ibm.icu.text.UnicodeSetSpanner.CountMethod;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.VersionInfo;

Expand Down Expand Up @@ -121,6 +112,7 @@ public enum DefaultPresentation {text, emoji}
private UnicodeSet otherHuman;
private UnicodeSet genderBase;
private UnicodeMap<String> toNeutral;
private UnicodeSet multiPersonGrouping;

public static final Splitter semi = Splitter.onPattern("[;#]").trimResults();
public static final Splitter semiOnly = Splitter.onPattern(";").trimResults();
Expand Down Expand Up @@ -506,11 +498,13 @@ private EmojiData(VersionInfo version) {

if (DEBUG) System.out.println("rawHairBases: " + rawHairBases.toPattern(false));

explicitGender.addAll(new UnicodeSet("[[👦-👩 👴 👵 🤴 👸 👲 🧕 🤵 👰 🤰 🤱 🎅 🤶 💃 🕺 🕴 👫-👭]]"))
explicitGender.addAll(new UnicodeSet("[[👦-👩 🧔 👴 👵 🤴 👸 👲 🧕 🤵 👰 🤰 🤱 🎅 🤶 💃 🕺 🕴 👫-👭]]"))
.freeze();

explicitHair.addAll(new UnicodeSet("[👱]"))
.freeze();

multiPersonGrouping = new UnicodeSet("[👯 🤼 👫-👭 💏 💑 👪 🤝]");

hairBases.addAll(rawHairBases)
.retainAll(modifierBases)
Expand Down Expand Up @@ -915,6 +909,7 @@ public UnicodeSet getSortingChars() {
}

public static final EmojiData EMOJI_DATA = of(Emoji.VERSION_TO_GENERATE);
public static final EmojiData EMOJI_DATA_BETA = of(Emoji.VERSION_BETA);

public UnicodeSet getFlagSequences() {
return flagSequences;
Expand Down Expand Up @@ -1387,20 +1382,23 @@ public static void main(String[] args) {
UnicodeSet explicitGendered = new UnicodeSet()
.addAll(e11a.maleToOther.keySet())
.addAll(e11a.femaleToOther.keySet())
.add(new UnicodeSet("[🧔]"))
.freeze();

UnicodeSet gendered = new UnicodeSet()
.addAll(e11a.maleToOther.keySet())
.addAll(e11a.femaleToOther.keySet())
.addAll(e11a.otherHuman)
.freeze();

UnicodeSet people = new UnicodeSet()
.addAll(EmojiOrder.BETA_ORDER.majorGroupings.getSet(MajorGroup.People))
.removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("body"))
.removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("emotion"))
.removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("clothing"))
.retainAll(e11a.allEmojiWithoutDefectives)
.freeze();

diff2("gendered", gendered, "people", people);

System.out.println("genderBase:\t" + e11a.getGenderBase().size() + "\t" + e11a.getGenderBase().toPattern(false));
Expand Down Expand Up @@ -1891,4 +1889,9 @@ static UnicodeSet getWithoutMods(UnicodeSet chars) {
public UnicodeSet getGenderBase() {
// Hands back the genderBase field itself; no copy is made here.
// NOTE(review): confirm the set is frozen before it is exposed to callers.
return genderBase;
}

/**
 * Returns the {@code multiPersonGrouping} field itself; no copy is made here.
 * <p>
 * NOTE(review): the visible initialization of {@code multiPersonGrouping} does not
 * call {@code freeze()}, unlike the neighbouring sets — confirm whether callers
 * could mutate the returned set.
 *
 * @return the multi-person-grouping set held by this instance
 */
@Override
public UnicodeSet getMultiPersonGroupings() {
return multiPersonGrouping;
}
}
2 changes: 2 additions & 0 deletions unicodetools/org/unicode/tools/emoji/EmojiDataSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,7 @@ public default UnicodeSet getEmojiForSortRules() {

public String addEmojiVariants(String s1);
public String getVersionString();
/**
 * Returns this source's explicit-gender set.
 * Implementations with nothing to report may return an empty set.
 */
public UnicodeSet getExplicitGender();
/**
 * Returns this source's multi-person-groupings set.
 * Implementations with nothing to report may return an empty set.
 */
public UnicodeSet getMultiPersonGroupings();
}

12 changes: 12 additions & 0 deletions unicodetools/org/unicode/tools/emoji/EmojiDataSourceCombined.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,18 @@ public String addEmojiVariants(String s1) {
public String getVersionString() {
    // Combined label: the emoji data version, the separator " + ", then the
    // candidates' own version string (evaluated in that order).
    final StringBuilder label = new StringBuilder();
    label.append(emojiData.getVersion());
    label.append(" + ");
    label.append(candidates.getVersionString());
    return label.toString();
}

/**
 * Combines the explicit-gender sets of {@code emojiData} and {@code candidates}
 * by passing both to {@code add}.
 */
@Override
public UnicodeSet getExplicitGender() {
    final UnicodeSet fromEmojiData = emojiData.getExplicitGender();
    final UnicodeSet fromCandidates = candidates.getExplicitGender();
    return add(fromEmojiData, fromCandidates);
}

/**
 * Combines the multi-person-groupings sets of {@code emojiData} and
 * {@code candidates} by passing both to {@code add}.
 */
@Override
public UnicodeSet getMultiPersonGroupings() {
    final UnicodeSet fromEmojiData = emojiData.getMultiPersonGroupings();
    final UnicodeSet fromCandidates = candidates.getMultiPersonGroupings();
    return add(fromEmojiData, fromCandidates);
}

// public static void main(String[] args) {
// UnicodeSet allChars = EMOJI_DATA.getAllEmojiWithDefectives();
Expand Down
9 changes: 8 additions & 1 deletion unicodetools/org/unicode/tools/emoji/GenerateEmojiData.java
Original file line number Diff line number Diff line change
Expand Up @@ -129,17 +129,24 @@ public static <T> void printData(UnicodeMap<String> extraNames) throws IOExcepti

try (TempPrintWriter outText2 = new TempPrintWriter(OUTPUT_DIR, "internal/emoji-internal.txt")) {
UnicodeSet emojiGenderBase = EmojiDataSourceCombined.EMOJI_DATA.getGenderBases();
UnicodeSet emojiExplicitGender = EmojiDataSourceCombined.EMOJI_DATA.getExplicitGender();
UnicodeSet emojiMultiPersonGroupings = EmojiDataSourceCombined.EMOJI_DATA.getMultiPersonGroupings();
outText2.println(Utility.getBaseDataHeader("emoji-internal", 51, "Emoji Data Internal", Emoji.VERSION_STRING));


int width = maxLength("Emoji_Gender_Base");
int width = maxLength("Emoji_Gender_Base",
"Emoji_Explicit_Gender",
"Multi_Person_Groupings"
);

// outText2.println("# Warning: the format has changed from Version 1.0");
outText2.println("# Format: ");
outText2.println("# <codepoint(s)> ; <property> # <comments> ");
outText2.println("# Note: there is no guarantee as to the structure of whitespace or comments");
outText2.println(ORDERING_NOTE);
printer.show(outText2, "Emoji_Gender_Base", null, width, 14, emojiGenderBase, true, true, false);
printer.show(outText2, "Emoji_Explicit_Gender", null, width, 14, emojiExplicitGender, true, true, false);
printer.show(outText2, "Multi_Person_Groupings", null, width, 14, emojiMultiPersonGroupings, true, true, false);
outText2.println("\n#EOF");
}

Expand Down
Loading

0 comments on commit a3a7054

Please sign in to comment.