From ddf9c7d1337a2db48ae4ae0fab1b95b8cc92f277 Mon Sep 17 00:00:00 2001 From: tormol Date: Sat, 26 Mar 2016 15:04:43 +0100 Subject: [PATCH 1/4] Convert `Ascii` into an enum. Allows static initialization, nicer pattern matches and comparisons. Naming conventions: * alphabetic: just use it. breaks CamelCase convention, but worth it for simplicity. * digits: prepend _ to make them valid identifiers, but still short. * non-alphanumeric but visible: Use Wikipedia names CamelCased, but remove -Mark endings. * rarely used control codes: use uppercase acronym to deter use. * more commonly used control codes: Expand and CamelCase acronym to preserve meaning, eg LineFeed not NewLine or LineBreak. I'm not certain which control codes should be expanded; Currently BEL and DEL are, but should they? I'm not sure whether this is a breaking change. --- src/ascii.rs | 224 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 197 insertions(+), 27 deletions(-) diff --git a/src/ascii.rs b/src/ascii.rs index 4cef347..7ecf1f3 100644 --- a/src/ascii.rs +++ b/src/ascii.rs @@ -1,12 +1,183 @@ +use std::mem::transmute; use std::fmt; #[cfg(feature="unstable")] use std::ascii::AsciiExt; use AsciiCast; +#[allow(non_camel_case_types)] /// Datatype to hold one ascii character. It wraps a `u8`, with the highest bit always zero. 
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash, Copy)] -pub struct Ascii { chr: u8 } +#[repr(u8)] +pub enum Ascii { + /**`'\0'`*/NUL = 0, + /** [Start Of Heading](http://en.wikipedia.org/wiki/Start_of_Heading) + */ SOH = 1, + /** [Start Of teXt](http://en.wikipedia.org/wiki/Start_of_Text) + */ SOX = 2, + /** [End Of teXt](http://en.wikipedia.org/wiki/End-of-Text_character) + */ ETX = 3, + /** [End Of Transmission](http://en.wikipedia.org/wiki/End-of-Transmission_character) + */ EOT = 4, + /** [Enquiry](http://en.wikipedia.org/wiki/Enquiry_character) + */ ENQ = 5, + /** [Acknowledgement](http://en.wikipedia.org/wiki/Acknowledge_character) + */ ACK = 6, + /** [bell / alarm / audible](http://en.wikipedia.org/wiki/Bell_character) + + '\a' is not supported by Rust. + */ Bell = 7, + /** [Backspace character](http://en.wikipedia.org/wiki/Backspace) + + '\b' is not supported by Rust. + */ BackSpace = 8, + /**`'\t'`*/Tab = 9, + /**`'\n'`*/LineFeed = 10, + /** [Vertical tab](http://en.wikipedia.org/wiki/Vertical_Tab) + + '\v' is not supported by Rust. + */ VT = 11, + /** [Form Feed](http://en.wikipedia.org/wiki/Form_Feed) + + '\f' is not supported by Rust. 
+ */ FF = 12, + /**`'\r'`*/CarriageReturn = 13, + /** [Shift In](http://en.wikipedia.org/wiki/Shift_Out_and_Shift_In_characters) + */ SI = 14, + /** [Shift Out](http://en.wikipedia.org/wiki/Shift_Out_and_Shift_In_characters) + */ SO = 15, + /** [Data Link Escape](http://en.wikipedia.org/wiki/Data_Link_Escape) + */ DLE = 16, + /** [Device control 1, often XON](http://en.wikipedia.org/wiki/Device_Control_1) + */ DC1 = 17, + /** Device control 2 + */ DC2 = 18, + /** Device control 3, Often XOFF + */ DC3 = 19, + /** Device control 4 + */ DC4 = 20, + /** [Negative Acknowledgement](http://en.wikipedia.org/wiki/Negative-acknowledge_character) + */ NAK = 21, + /** [Synchronous idle](http://en.wikipedia.org/wiki/Synchronous_Idle) + */ SYN = 22, + /** [End of Transmission Block](http://en.wikipedia.org/wiki/End-of-Transmission-Block_character) + */ ETB = 23, + /** [Cancel](http://en.wikipedia.org/wiki/Cancel_character) + */ CAN = 24, + /** [End of Medium](http://en.wikipedia.org/wiki/End_of_Medium) + */ EM = 25, + /** [Substitute](http://en.wikipedia.org/wiki/Substitute_character) + */ SUB = 26, + /** [Escape](http://en.wikipedia.org/wiki/Escape_character) + + '\e' is not supported in Rust. 
+ */ Escape = 27, + /** [File Separator](http://en.wikipedia.org/wiki/File_separator) + */ FS = 28, + /** [Group Separator](http://en.wikipedia.org/wiki/Group_separator) + */ GS = 29, + /** [Record Separator](http://en.wikipedia.org/wiki/Record_separator) + */ RS = 30, + /** [Unit Separator](http://en.wikipedia.org/wiki/Unit_separator) + */ US = 31, + /**`' '`*/ Space = 32, + /**`'!'`*/ Exclamation = 33, + /**`'"'`*/ Quotation = 34, + /**`'''`*/ Hash = 35, + /**`'$'`*/ Dollar = 36, + /**`'%'`*/ Percent = 37, + /**`'&'`*/ Ampersand = 38, + /**`'''`*/ Apostrophe = 39, + /**`'('`*/ ParenOpen = 40, + /**`')'`*/ ParenClose = 41, + /**`'*'`*/ Asterisk = 42, + /**`'+'`*/ Plus = 43, + /**`','`*/ Comma = 44, + /**`'-'`*/ Minus = 45, + /**`'.'`*/ Dot = 46, + /**`'/'`*/ Slash = 47, + /**`'0'`*/ _0 = 48, + /**`'1'`*/ _1 = 49, + /**`'2'`*/ _2 = 50, + /**`'3'`*/ _3 = 51, + /**`'4'`*/ _4 = 52, + /**`'5'`*/ _5 = 53, + /**`'6'`*/ _6 = 54, + /**`'7'`*/ _7 = 55, + /**`'8'`*/ _8 = 56, + /**`'9'`*/ _9 = 57, + /**`':'`*/ Colon = 58, + /**`';'`*/ SemiColon = 59, + /**`'<'`*/ LessThan = 60, + /**`'='`*/ Equal = 61, + /**`'>'`*/ GreaterThan = 62, + /**`'?'`*/ Question = 63, + /**`'@'`*/ At = 64, + /**`'A'`*/ A = 65, + /**`'B'`*/ B = 66, + /**`'C'`*/ C = 67, + /**`'D'`*/ D = 68, + /**`'E'`*/ E = 69, + /**`'F'`*/ F = 70, + /**`'G'`*/ G = 71, + /**`'H'`*/ H = 72, + /**`'I'`*/ I = 73, + /**`'J'`*/ J = 74, + /**`'K'`*/ K = 75, + /**`'L'`*/ L = 76, + /**`'M'`*/ M = 77, + /**`'N'`*/ N = 78, + /**`'O'`*/ O = 79, + /**`'P'`*/ P = 80, + /**`'Q'`*/ Q = 81, + /**`'R'`*/ R = 82, + /**`'S'`*/ S = 83, + /**`'T'`*/ T = 84, + /**`'U'`*/ U = 85, + /**`'V'`*/ V = 86, + /**`'W'`*/ W = 87, + /**`'X'`*/ X = 88, + /**`'Y'`*/ Y = 89, + /**`'Z'`*/ Z = 90, + /**`'['`*/ BracketOpen = 91, + /**`'\'`*/ BackSlash = 92, + /**`']'`*/ BracketClose = 93, + /**`'_'`*/ Caret = 94, + /**`'_'`*/ UnderScore = 95, + /**`'`'`*/ Grave = 96, + /**`'a'`*/ a = 97, + /**`'b'`*/ b = 98, + /**`'c'`*/ c = 99, + /**`'d'`*/ d = 100, + 
/**`'e'`*/ e = 101, + /**`'f'`*/ f = 102, + /**`'g'`*/ g = 103, + /**`'h'`*/ h = 104, + /**`'i'`*/ i = 105, + /**`'j'`*/ j = 106, + /**`'k'`*/ k = 107, + /**`'l'`*/ l = 108, + /**`'m'`*/ m = 109, + /**`'n'`*/ n = 110, + /**`'o'`*/ o = 111, + /**`'p'`*/ p = 112, + /**`'q'`*/ q = 113, + /**`'r'`*/ r = 114, + /**`'s'`*/ s = 115, + /**`'t'`*/ t = 116, + /**`'u'`*/ u = 117, + /**`'v'`*/ v = 118, + /**`'w'`*/ w = 119, + /**`'x'`*/ x = 120, + /**`'y'`*/ y = 121, + /**`'z'`*/ z = 122, + /**`'{'`*/ CurlyBraceOpen = 123, + /**`'|'`*/ VerticalBar = 124, + /**`'}'`*/ CurlyBraceClose = 125, + /**`'~'`*/ Tilde = 126, + /**[Delete](http://en.wikipedia.org/wiki/Delete_character) + */ Delete = 127, +} impl Ascii { /// Constructs an Ascii character from a `char`. @@ -24,7 +195,7 @@ impl Ascii { #[inline] pub fn from(ch: char) -> Result { if ch as u32 <= 0x7F { - return Ok( Ascii { chr: ch as u8 }); + return Ok(unsafe{ ch.to_ascii_nocheck() }); } Err(()) } @@ -45,7 +216,7 @@ impl Ascii { #[inline] pub fn from_byte(ch: u8) -> Result { if ch <= 0x7F { - return Ok( Ascii { chr: ch }); + return Ok(unsafe{ ch.to_ascii_nocheck() }); } Err(()) } @@ -53,13 +224,13 @@ impl Ascii { /// Converts an ascii character into a `u8`. #[inline] pub fn as_byte(&self) -> u8 { - self.chr + unsafe{ transmute(*self) } } /// Converts an ascii character into a `char`. 
#[inline] pub fn as_char(&self) -> char { - self.chr as char + self.as_byte() as char } // the following methods are like ctype, and the implementation is inspired by musl @@ -67,13 +238,14 @@ impl Ascii { /// Check if the character is a letter (a-z, A-Z) #[inline] pub fn is_alphabetic(&self) -> bool { - (self.chr >= 0x41 && self.chr <= 0x5A) || (self.chr >= 0x61 && self.chr <= 0x7A) + (self >= &Ascii::a && self <= &Ascii::z) || + (self >= &Ascii::A && self <= &Ascii::Z) } /// Check if the character is a number (0-9) #[inline] pub fn is_digit(&self) -> bool { - self.chr >= 0x30 && self.chr <= 0x39 + self >= &Ascii::_0 && self <= &Ascii::_9 } /// Check if the character is a letter or number @@ -85,7 +257,7 @@ impl Ascii { /// Check if the character is a space or horizontal tab #[inline] pub fn is_blank(&self) -> bool { - self.chr == b' ' || self.chr == b'\t' + *self == Ascii::Space || *self == Ascii::Tab } /// Check if the character is a control character @@ -101,7 +273,7 @@ impl Ascii { /// ``` #[inline] pub fn is_control(&self) -> bool { - self.chr < 0x20 || self.chr == 0x7F + self.as_byte() < 0x20 || *self == Ascii::Delete } /// Checks if the character is printable (except space) @@ -116,7 +288,7 @@ impl Ascii { /// ``` #[inline] pub fn is_graph(&self) -> bool { - self.chr.wrapping_sub(0x21) < 0x5E + self.as_byte().wrapping_sub(0x21) < 0x5E } /// Checks if the character is printable (including space) @@ -131,7 +303,7 @@ impl Ascii { /// ``` #[inline] pub fn is_print(&self) -> bool { - self.chr.wrapping_sub(0x20) < 0x5F + self.as_byte().wrapping_sub(0x20) < 0x5F } /// Checks if the character is alphabetic and lowercase @@ -146,7 +318,7 @@ impl Ascii { /// ``` #[inline] pub fn is_lowercase(&self) -> bool { - self.chr.wrapping_sub(b'a') < 26 + self.as_byte().wrapping_sub(b'a') < 26 } /// Checks if the character is alphabetic and uppercase @@ -161,7 +333,7 @@ impl Ascii { /// ``` #[inline] pub fn is_uppercase(&self) -> bool { - self.chr.wrapping_sub(b'A') < 26 + 
self.as_byte().wrapping_sub(b'A') < 26 } /// Checks if the character is punctuation @@ -193,7 +365,7 @@ impl Ascii { /// ``` #[inline] pub fn is_hex(&self) -> bool { - self.is_digit() || (self.chr | 32u8).wrapping_sub(b'a') < 6 + self.is_digit() || (self.as_byte() | 32u8).wrapping_sub(b'a') < 6 } } @@ -219,25 +391,25 @@ impl AsciiExt for Ascii { } fn to_ascii_uppercase(&self) -> Ascii { - Ascii{chr: self.chr.to_ascii_uppercase()} + unsafe{ self.as_byte().to_ascii_uppercase().to_ascii_nocheck() } } fn to_ascii_lowercase(&self) -> Ascii { - Ascii{chr: self.chr.to_ascii_lowercase()} + unsafe{ self.as_byte().to_ascii_uppercase().to_ascii_nocheck() } } fn eq_ignore_ascii_case(&self, other: &Self) -> bool { - self.chr.eq_ignore_ascii_case(&other.chr) + self.as_byte().eq_ignore_ascii_case(&other.as_byte()) } #[inline] fn make_ascii_uppercase(&mut self) { - self.chr.make_ascii_uppercase() + *self = self.to_ascii_uppercase(); } #[inline] fn make_ascii_lowercase(&mut self) { - self.chr.make_ascii_lowercase() + *self = self.to_ascii_lowercase(); } } @@ -246,7 +418,7 @@ impl<'a> AsciiCast<'a> for u8 { #[inline] unsafe fn to_ascii_nocheck(&self) -> Ascii { - Ascii{ chr: *self } + transmute(*self) } } @@ -255,7 +427,7 @@ impl<'a> AsciiCast<'a> for char { #[inline] unsafe fn to_ascii_nocheck(&self) -> Ascii { - Ascii{ chr: *self as u8 } + (*self as u8).to_ascii_nocheck() } } @@ -266,10 +438,10 @@ mod tests { #[test] fn to_ascii() { - assert_eq!(65_u8.to_ascii(), Ok(Ascii { chr: 65_u8 })); + assert_eq!(65_u8.to_ascii(), Ok(Ascii::A)); assert_eq!(255_u8.to_ascii(), Err(())); - assert_eq!('A'.to_ascii(), Ok(Ascii { chr: 65_u8 })); + assert_eq!('A'.to_ascii(), Ok(Ascii::A)); assert_eq!('λ'.to_ascii(), Err(())); } @@ -302,13 +474,11 @@ mod tests { #[test] fn fmt_display_ascii() { - let s = Ascii { chr: b't' }; - assert_eq!(format!("{}", s), "t".to_string()); + assert_eq!(format!("{}", Ascii::t), "t".to_string()); } #[test] fn fmt_debug_ascii() { - let c = Ascii { chr: b't' }; - 
assert_eq!(format!("{:?}", c), "'t'".to_string()); + assert_eq!(format!("{:?}", Ascii::t), "'t'".to_string()); } } From 984cc28769f7e1297ee44f5b0f730ac87d2738fd Mon Sep 17 00:00:00 2001 From: tormol Date: Sat, 26 Mar 2016 15:38:05 +0100 Subject: [PATCH 2/4] Optimize `Ascii.is_alphabetic()` --- src/ascii.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ascii.rs b/src/ascii.rs index 7ecf1f3..71edafc 100644 --- a/src/ascii.rs +++ b/src/ascii.rs @@ -238,8 +238,8 @@ impl Ascii { /// Check if the character is a letter (a-z, A-Z) #[inline] pub fn is_alphabetic(&self) -> bool { - (self >= &Ascii::a && self <= &Ascii::z) || - (self >= &Ascii::A && self <= &Ascii::Z) + let c = self.as_byte() | 0b010_0000;// Turns uppercase into lowercase. + c >= b'a' && c <= b'z' } /// Check if the character is a number (0-9) From 0bcced9f57f11cca111e28c50ab3b743ed9f5e6e Mon Sep 17 00:00:00 2001 From: tormol Date: Mon, 28 Mar 2016 05:05:18 +0200 Subject: [PATCH 3/4] Remove unnecessary transmute and expand some unsafe blocks --- src/ascii.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ascii.rs b/src/ascii.rs index 71edafc..3e18e7d 100644 --- a/src/ascii.rs +++ b/src/ascii.rs @@ -194,9 +194,9 @@ impl Ascii { /// ``` #[inline] pub fn from(ch: char) -> Result { - if ch as u32 <= 0x7F { - return Ok(unsafe{ ch.to_ascii_nocheck() }); - } + unsafe{if ch as u32 <= 0x7F { + return Ok(ch.to_ascii_nocheck()); + }} Err(()) } @@ -215,16 +215,16 @@ impl Ascii { /// ``` #[inline] pub fn from_byte(ch: u8) -> Result { - if ch <= 0x7F { - return Ok(unsafe{ ch.to_ascii_nocheck() }); - } + unsafe{if ch <= 0x7F { + return Ok(ch.to_ascii_nocheck()); + }} Err(()) } /// Converts an ascii character into a `u8`. #[inline] pub fn as_byte(&self) -> u8 { - unsafe{ transmute(*self) } + *self as u8 } /// Converts an ascii character into a `char`. 
From c290db0b518b33a79d842965c5b141cb79cdd9d8 Mon Sep 17 00:00:00 2001 From: tormol Date: Mon, 28 Mar 2016 05:00:26 +0200 Subject: [PATCH 4/4] Make Ascii variant names consistent Control code names get expanded iff they have an escape code in this table (the link URL is missing from this message). Else the two-or-three letter uppercase code name is used. # modified: src/ascii.rs --- src/ascii.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ascii.rs b/src/ascii.rs index 3e18e7d..ecdf6d8 100644 --- a/src/ascii.rs +++ b/src/ascii.rs @@ -10,7 +10,7 @@ use AsciiCast; #[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash, Copy)] #[repr(u8)] pub enum Ascii { - /**`'\0'`*/NUL = 0, + /**`'\0'`*/Null = 0, /** [Start Of Heading](http://en.wikipedia.org/wiki/Start_of_Heading) */ SOH = 1, /** [Start Of teXt](http://en.wikipedia.org/wiki/Start_of_Text) @@ -36,11 +36,11 @@ pub enum Ascii { /** [Vertical tab](http://en.wikipedia.org/wiki/Vertical_Tab) '\v' is not supported by Rust. - */ VT = 11, + */ VerticalTab = 11, /** [Form Feed](http://en.wikipedia.org/wiki/Form_Feed) '\f' is not supported by Rust. - */ FF = 12, + */ FormFeed = 12, /**`'\r'`*/CarriageReturn = 13, /** [Shift In](http://en.wikipedia.org/wiki/Shift_Out_and_Shift_In_characters) */ SI = 14, @@ -175,8 +175,8 @@ pub enum Ascii { /**`'|'`*/ VerticalBar = 124, /**`'}'`*/ CurlyBraceClose = 125, /**`'~'`*/ Tilde = 126, - /**[Delete](http://en.wikipedia.org/wiki/Delete_character) - */ Delete = 127, + /** [Delete](http://en.wikipedia.org/wiki/Delete_character) + */ DEL = 127, } impl Ascii { @@ -273,7 +273,7 @@ impl Ascii { /// ``` #[inline] pub fn is_control(&self) -> bool { - self.as_byte() < 0x20 || *self == Ascii::Delete + self.as_byte() < 0x20 || *self == Ascii::DEL } /// Checks if the character is printable (except space)