; This file was edited using TECkitMappingEditorU.exe v1.0.0.1 on 2/13/2007.
; Conversion Type = Legacy_to_from_Unicode
; Left-hand side font = Courier New;12
; Right-hand side font = Ezra SIL;15.75
; Main Window Position = 0,0,1280,809
; Code Point Window Position = 880,52,374,831

; SIL Hebrew Standard Encoding SILEzratoUni50.map copyright SIL 2003
; with support for Standard or Display Encoding conversion to Unicode
; and return conversion from Unicode to a modified Standard Encoding
; (Note that displayable text direction must be reversed before running this conversion!)
; initial draft by Peter Kirk, December 2000
; modified to use newer TECkit features, Jonathan Kew, January 2001
; tweaked for current compiler, JK, Sept 2002
; modified, Joan Wardell, Jan-April 2003
; modified for NEW ORDER rearrangement Joan Wardell, July 2003-August 2003
; added Return (Unicode -> SE) Joan Wardell, September 2003 Files will not be perfect. An
; exact return trip is not possible because the information needed is no longer there.
; assistance from Jonathan Kew Oct 2003
; modified for added characters and recommendations in Unicode 4.1 and 5.0 cjs April 2006-March 2007

EncodingName            "SIL-HEBREW_STANDARD-1997"      ;jw
DescriptiveName         "SIL Hebrew Standard Encoding"  ;jw
Version                 "58"                            ;jw
Contact                 "mailto: sil_fonts@sil.org"
RegistrationAuthority   "SIL International"             ;jw
RegistrationName        "SILEzra"
;LHSFlags (Consonant-Vowel-CantOrder) ;jw
;RHSFlags (Consonant-Vowel-CantOrder) ;jw
; None of the standard orders apply to biblical Hebrew. Visual order means
; the text has been reversed to appear correctly in applications that
; cannot handle right-to-left scripts. An example is Hebrew SE or DE
; displayed in Word (pre-Word 2000). Logical order means the text is
; in spoken order (approx), unreversed. Neither NFC nor NFD are appropriate
; choices for biblical Hebrew displayed with any Ezra SIL font. ;jw

;Note that this program makes 4 passes now. 8/25/03 jw
;Note that this program makes 6 passes now. cjs March 2007

;**********************************************************************************************
;Initial pass to remove thinspaces, found in DE texts. jw 4/14/03
pass (Byte)

255 >           ;thinspace deleted

;**********************************************************************************************
;Pass to Split any plene vowels and do rearrangement 8/25/03
pass (Byte)

;attempting to list any mark possibly preceding a plene vowel (anything between consonant
; and plene, excluding dagesh, rafe, including any cant, masora, punctum, asterisk)
; This class is only matched and copied (=a / @a) by the rules of this pass, so a second
; listing of 126 that used to appear in the "085 086 ..." row has been dropped.
ByteClass [any_mark_de] = (043 066 084 \
    089 090 094 .. 096 106 126 149 \
    152 .. 159 161 \
    178 179 181 180 189 .. 193 242 \
    245 247 249 250 253 \
    036 037 061 068 077 080 .. 082 \
    085 086 128 .. 144 \
    174 175 177 182 184 .. 186 188 )

ByteClass [vow_de] = ( 252 204 217 232 248 205 218 237 \
    233 206 219 239 \
    225 207 220 240 \
    243 221 236 241 \
    105 199 214 230 \
    069 197 212 227 \
    101 198 213 229 \
    097 194 209 222 \
    065 195 210 223 111 196 211 224 \
    079 200 215 \
    117 201 216 231 )

;Split plene vowels (full spelling SE only, but possible)
;([any_mark_de]){0,2}=a 244 > 079 @a 119 ;sample syntax, note curly brackets! 8/22/03
;NOTE that * will delete all but the first occ, unless you use @ on the output side. 8/22/03
([any_mark_de]*)=a 203 > 069 @a 104
([any_mark_de]*)=a 228 > 065 @a 104
([any_mark_de]*)=a 235 > 101 @a 104
([any_mark_de]*)=a 246 > 079 @a 104
([any_mark_de]*)=a 202 > 069 @a 121
([any_mark_de]*)=a 234 > 101 @a 121
([any_mark_de]*)=a 238 > 105 @a 121

;Split holem-vav to holem [any mark] vav
([any_mark_de]*)=a 244 > 079 @a 119
@a 244 < 079 ([any_mark_de]){0,2}=a 119

;move rafe before any de vowel - no reversal as it should have been before vowel anyway
[vow_de]=a 038=b > 038=b [vow_de]=a

;**********************************************************************************************
;Convert DE data to SE, in preparation for Unicode conversion jw 8/14/03
pass (Byte)

ByteClass [WS] = ( 009 010 013 032 176 )

; [cons] is used only as left context in this pass; 083 was listed twice and is now listed once.
ByteClass [cons] = ( 039 083 087 072 098 103 100 104 119 122 120 \
    088 121 107 108 109 110 115 118 \
    112 099 113 114 116 ) ;added all sins 10/6/03

;deo= DE Only (excludes SE) vowels
ByteClass [patahs_deo]   = (194 209 222)
ByteClass [kamets_deo]   = (195 210 223 196 111 211 224) ;also converts qamets_o, no longer avail
ByteClass [hireqs_deo]   = (199 214 230)
ByteClass [segols_deo]   = (198 213 229)
ByteClass [tseres_deo]   = (197 212 227)
ByteClass [holems_deo]   = (200 215)
ByteClass [qibbs_deo]    = (201 216 231)
ByteClass [shewas_deo]   = (204 217 232 248 205 218 237) ;also converts all silent shewa, no longer avail
ByteClass [h_patahs_deo] = (207 220 240)
ByteClass [h_kamets_deo] = (236 221 241)
ByteClass [h_segols_deo] = (206 219 239)

ByteClass [vow_de] = ( 252 204 217 232 248 205 218 237 \
    233 206 219 239 \
    225 207 220 240 \
    243 221 236 241 \
    105 199 214 230 \
    069 197 212 227 \
    101 198 213 229 \
    097 194 209 222 \
    065 195 210 223 111 196 211 224 \
    079 200 215 \
    117 201 216 231 )

ByteClass [meteg] = ( 149 189 .. 193 )
; NOTE(review): [meteg_se] and [holem_se] are not referenced by any rule in this pass;
; the meteg/holem swap rule further down re-emits the matched [holem]/[meteg] members.
; Presumably they were meant as SE output classes for that rule - confirm before changing.
ByteClass [meteg_se] = ( 149 149 149 149 149 149 )
ByteClass [holem] = (079 200 215)
ByteClass [holem_se] = (079 079 079)
ByteClass [munah] = (158 90 249)
ByteClass [dr_de] = ( 067 070 071 073 074 075 076 183 208 \
    038 )
ByteClass [pp_de] = ( 077 132 138 152 155 )

;plene SE:
; 244 holem-vav
; 251 shureq
; 202 tsere-yod
; 234 segol-yod
; 238 hiriq-yod
; 203 tsere-he
; 228 qamats-he
; 235 segol-he
; 246 holem-he
;plene parts SE:
; he = 104
; yod = 121
; vav = 119
; hiriq = 105
; tsere = 69
; qamats = 65
; segol = 101
; holem = 079

;Split hataf-metegs NO! need to preserve these for later special treatment cjs April 2006
;254 > 225 149
;102 > 233 149
;but the reverse is needed in the back conversion cjs May 2006
;NO! we have to sacrifice the distinction between medial and left in the back conversion to get pre-positive cants right
;225 149 < 254
;233 149 < 102

;Move DE data to SE
;Dagesh
183 > 208
67 > 208
70 > 208
71 > 208
73 > 208
74 > 208
75 > 208
76 > 208
;Accent
180 > 181
;Asterisk
43 > 42 ;(high asterisk no longer available, converting to regular asterisk)
;High Cants
174 > 128
175 > 129
177 > 130
182 > 131
184 > 132
185 > 133
186 > 134
188 > 135
68 > 136
61 > 137
77 > 138
85 > 139
80 > 140
81 > 141
82 > 142
86 > 144
;Low Cants
189 > 149
190 > 149
191 > 149
192 > 149
193 > 149
96 > 152
95 > 153
106 > 154
89 > 155
247 > 155
66 > 156
245 > 156
84 > 157
90 > 158
249 > 158
94 > 159
250 > 159
242 > 161
253 > 161

;Convert DE Vowels to SE
[patahs_deo] > 097
[kamets_deo] > 065
[hireqs_deo] > 105
[segols_deo] > 101
[tseres_deo] > 069
[holems_deo] > 079
[qibbs_deo] > 117
[shewas_deo] > 252
[h_patahs_deo] > 225
[h_kamets_deo] > 243
[h_segols_deo] > 233

;Convert shureq to vav+dagesh
251 > 119 208
;rearrange pre-positive metheg on shureq to follow - word-initial only - only needed for de text cjs May 2006
;de text sometimes has both pre-pos cant and metheg before word-initial shureq
149 251 / ([WS] | #) [pp_de]? _ > 119 208 149

;Split DE Finals combinations into intermediate state for now
167 > 107 65
168 > 110 65
169 > 112 208
170 > 107 252
172 > 107 208
173 > 107 208 65

;108 [meteg] [holem] [munah] > 108 079 149 158 ;rearrange lamed-meteg-holem to lamed-holem-meteg.
; need to find out where holem is in relation to right meteg. L-met-O-munah 1 occ, O95-11 occs which are
; encoded as meteg-holem in Ezra SIL SE/DE.
; No need to use PUA right meteg, just get order correct: meteg after O-holem. 8/14/03
;TEST REMOVAL. Keep. No, try class rule below to get all occs. 8/25/03
;108 191 215 90 > 108 079 149 158 ;this is 1 occ. Deut 5:8:1, LO9574 jw 4/1/03
[meteg]=a [holem]=b / [cons] [dr_de]? _ > [holem]=b [meteg]=a
;Not possible to do return trip to 95, there is no context and there are 196 occs of meteg(75) with holem.
;These look identical in Unicode. This conversion table makes it so you cannot have a right meteg on holem.
;That doesn't make sense anyway.

;RETURN TRIP
; 'Chapter' < 'Ðhapter' ;compiler rejects quoted strings so change to
67 104 97 112 116 101 114 < 208 104 97 112 116 101 114 ;added to make data processed through our cc tables a bit cleaner.

;**********************************************************************************************
;Pass to convert from byte (as in, ASCII & upper ASCII Ezra SIL encoding) data to Unicode.
;Note that this pass originally was written
;to handle either DE or SE data. At present, no DE data should be passing through here
;but I'm not certain. 8/25/03 jw
pass (Byte_Unicode) ;jw

ByteDefault 063                     ; question mark
UniDefault replacement_character    ;FFFD

; in the mapping rules
; <> means a bidirectional rule
; > means only byte->Unicode
; < means only Unicode->byte

; General order of encoding in SIL encoding {parens() = optional}:
; consonant (dagesh/rafe)(vowel point)(meteg|cantillation).
; Exceptions: Right meteg comes before vowel point.
; Prepositive accents come before word-initial consonant.
; In the few cases where meteg and cantillation co-occur, the order is not fixed.
; Cantillation can be either high-low or low-high, or mixed. It is not fixed.

; CLASS DEFINITIONS
; there are separate namespaces for Byte and Unicode classes,
; allowing us to use the same name for classes with corresponding content

; control characters
ByteClass [CTL] = ( 0x00 .. 0x1f 0x7f )
UniClass  [CTL] = ( U+0000 .. U+001f U+007f )

; ASCII characters, excluding space
ByteClass [ascii] = ( 0x21 .. 0x7e )
UniClass  [ascii] = ( U+0021 .. U+007e )

; alphanumeric ASCII characters, legal in SF markers
ByteClass [anum] = ( 0x30 .. 0x39 0x41 .. 0x5A 0x61 .. 0x7A )
UniClass  [anum] = ( U+0030 .. U+0039 U+0041 .. U+005A U+0061 .. U+007A )

; numeric ASCII characters, found in verse numbers jw 3/13/03
ByteClass [num] = ( 048 .. 057 )
UniClass  [num] = ( U+0030 .. U+0039 ) ;uncommented 7/24/03 jw

; whitespace (excluding 255 thin space which is used for spacing within words)
ByteClass [WS] = ( 009 010 013 032 176 ) ; deleted 160, can't remember what it is! 7/24/03 jw
UniClass  [WS] = ( U+0009 U+000A U+000D U+0020 U+00A0 ) ;7/24/03 jw

; whitespace plus word dividing punctuation (including maqqef) and stream start/end
;8/22/03 changed name from WSP to OOB for OUT OF BOUNDS. This more clearly reflects what is contained here -
;any mark which is not Hebrew, or would legitimately force a non-final to become final, such as numbers. This
;is also used to identify pre-positive marks. 8/25/03
ByteClass [OOB] = ( 009 010 013 032 .. 035 040 041 042 044 .. 047 058 059 060 062 063 \
    091 092 093 123 124 125 150 151 160 176 145 .. 148 171 187 048 .. 057 ) ;jw added quotes, numbers, dashes, setuma/petuha
;copied here from last pass 9/12/03 jw,used for RETURN context, may not match above
UniClass [OOB] = ( U+0009 U+000A U+000D U+0020 U+00A0 U+0020 .. U+0040 U+05F3 U+05F4 \
    U+05BE U+05C0 U+05C3 U+005B .. U+0060 \
    U+007B .. U+007E U+00A7 U+00AB U+00AF \
    U+00B6 U+00BB U+00BF U+2000 .. U+200A U+2010 .. U+2021 \
    U+2039 U+203A )

; all Hebrew letters etc in Unicode
;UniClass [heb] = ( U+0591 .. U+05F4 U+FB1D .. U+FB4F )

; consonants (in Hebrew alphabetical order including unpointed shin, no non-final or final forms, jw; excl sin, shin w/dots, non-finals)
;ByteClass [cons] = ( 039 098 103 100 104 119 122 120 \
; 088 121 107 108 109 110 115 118 \
; 112 099 113 114 083 116 )
UniClass [cons] = ( U+05D0 U+05D1 U+05D2 U+05D3 U+05D4 U+05D5 U+05D6 U+05D7 \
    U+05D8 U+05D9 U+05DB U+05DC U+05DE U+05E0 U+05E1 U+05E2 \
    U+05E4 U+05E6 U+05E7 U+05E8 U+05E9 U+05EA )

; non-final forms of consonants with final forms
ByteClass [nonf] = ( 107 109 110 112 099 )
UniClass  [nonf] = ( U+05DB U+05DE U+05E0 U+05E4 U+05E6 )

; final forms of consonants
UniClass [final] = ( U+05DA U+05DD U+05DF U+05E3 U+05E5 )

; all consonants, including shin/sins, nonfinal, Display Encoding final forms, plene forms of vowels.
; Should be used in any context searching for consonants or forms that take space on the baseline.
; Do not use for conversion. ;jw 4/28/03 9/18/03 deleted line 5 duplicates
ByteClass [allcons] = (039 098 103 100 104 119 122 120 \
    088 121 107 108 109 110 115 118 \
    112 099 113 114 083 116 \
    072 087 \
    162 163 164 165 166 167 168 169 170 172 173 \
    244 251 \
    202 203 228 234 235 238 246 )

; meteg - all DE varieties
ByteClass [meteg] = ( 149 189 .. 193 )

; all points
ByteClass [point] = ( 036 .. 038 043 061 065 .. 071 073 .. 077 079 .. 082 084 .. 086 \
    089 090 094 .. 097 101 102 105 106 111 117 126 128 .. 144 149 \
    152 .. 159 161 174 175 177 .. 186 188 .. 201 204 .. 225 \
    227 229 .. 233 236 237 239 .. 243 245 247 .. 250 252 .. 254 ) ;jw ;added punctum 178-179

; prepositive accents in SE and their Unicode equivalents
ByteClass [pp_se] = ( 155 138 152 132 ) ;jw chg 77 to 138 geresh SE
UniClass  [pp_se] = ( U+05AD U+059D U+059A U+05A0 )

; prepositive accents in DE and their Unicode equivalents
;ByteClass [pp_de] = ( 155 089 247 077 138 152 096 132 184 )
;UniClass [pp_de] = ( U+05AD U+05AD U+05AD U+059D U+059D U+059A U+059A U+05A0 U+05A0 )
; not sure if it is permitted to have duplicate class members
; but it is useful here in a class which appears only as output of rule
; indeed the following table depends on this, would be much more complicated otherwise

; vowel points in SE and their Unicode equivalents
ByteClass [vow_se] = ( 252 233 225 243 105 069 101 097 065 079 117 )
UniClass  [vow_se] = ( U+05B0 .. U+05B9 U+05BB )

; vowel points in DE and their Unicode equivalents (one line per vowel)
; note that SE silent shewa and qamets o are treated as DE only here
ByteClass [vow_de] = ( 252 204 217 232 248 205 218 237 \
    233 206 219 239 \
    225 207 220 240 \
    243 221 236 241 \
    105 199 214 230 \
    069 197 212 227 \
    101 198 213 229 \
    097 194 209 222 \
    065 195 210 223 111 196 211 224 \
    079 200 215 \
    117 201 216 231 )
UniClass [vow_de] = ( U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 \
    U+05B1 U+05B1 U+05B1 U+05B1 \
    U+05B2 U+05B2 U+05B2 U+05B2 \
    U+05B3 U+05B3 U+05B3 U+05B3 \
    U+05B4 U+05B4 U+05B4 U+05B4 \
    U+05B5 U+05B5 U+05B5 U+05B5 \
    U+05B6 U+05B6 U+05B6 U+05B6 \
    U+05B7 U+05B7 U+05B7 U+05B7 \
    U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 \
    U+05B9 U+05B9 U+05B9 \
    U+05BB U+05BB U+05BB U+05BB )

; dagesh and rafe in SE and their Unicode equivalents
ByteClass [dr_se] = ( 208 038 )
UniClass  [dr_se] = ( U+05BC U+05BF )

; dagesh and rafe in DE and their Unicode equivalents
ByteClass [dr_de] = ( 067 070 071 073 074 075 076 183 208 \
    038 )
UniClass [dr_de] = ( U+05BC U+05BC U+05BC U+05BC U+05BC U+05BC U+05BC U+05BC U+05BC \
    U+05BF )

; shin and sin
ByteClass [shs] = ( 072 087 )

; shin and sin dots
UniClass [shindots] = ( U+05C1 U+05C2 )

;start storage by jw
; low vowels CCAT
; A F I E " U : :a :f :e
;ByteClass [low_vow_de] = (252 204 217 232 248 205 218 237 \
; 233 206 219 239 \
; 225 207 220 240 \
; 243 221 236 241 \
; 105 199 214 230 \
; 069 197 212 227 \
; 101 198 213 229 \
; 097 194 209 222 \
; 065 195 210 223 111 196 211 224 \
; 117 201 216 231 )
;UniClass [low_vow_de] = (U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 U+05B0 \
; U+05B1 U+05B1 U+05B1 U+05B1 \
; U+05B2 U+05B2 U+05B2 U+05B2 \
; U+05B3 U+05B3 U+05B3 U+05B3 \
; U+05B4 U+05B4 U+05B4 U+05B4 \
; U+05B5 U+05B5 U+05B5 U+05B5 \
; U+05B6 U+05B6 U+05B6 U+05B6 \
; U+05B7 U+05B7 U+05B7 U+05B7 \
; U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 U+05B8 \
; U+05BB U+05BB U+05BB U+05BB )

; low cants CCAT
; 35 70 71 72 73 74 75 91 92 93 94 95
ByteClass [low_p_se] = (149 \
    152 .. 159 161 \
    179 179 )
;changed U+0323 to U+05C5 lower dot cjs April 2006
UniClass [low_p_se] = (U+05BD \
    U+05A4 U+059B U+05AA U+0596 U+05A5 U+05A6 U+05A3 U+05A7 U+0591 \
    U+05C5 U+0323 )

;ByteClass [high_vow_de] = (079 200 215)
;UniClass [high_vow_de] = (U+05B9 U+05B9 U+05B9)

; high cants CCAT
; 24 33 44 60 61 62 63 64 65 80 81 82 83 84 85
ByteClass [high_p_se] = (036 037 \
    126 128 .. 144 \
    178 )

;RE--ADDING THESE TWO GROUPS FOR HOLEM VAV RETURN PROCESSING! 9/12/03 jw
ByteClass [any_p_se] = ( 036 037 \
    126 128 .. 144 \
    149 \
    152 .. 159 161 \
    178 179 179 )
;changed U+0323 to U+05C5 lower dot
UniClass [any_p_se] = ( U+0307 U+0308 \
    U+05AF U+0597 U+0594 U+0592 U+05A9 U+05A0 U+059F U+05AB \
    U+05A1 U+0595 U+05A8 U+059C U+059E U+05AC U+0598 U+0593 U+0599 U+05AE \
    U+05BD \
    U+05A4 U+059B U+05AA U+0596 U+05A5 U+05A6 U+05A3 U+05A7 U+0591 \
    U+05C4 U+05C5 U+0323 )

ByteClass [holem] = (079 200 215) ;jw 4/17/03
ByteClass [munah] = (158 90 249)

Define ZWNJ U+200C ;jw 8/4/03
Define ZWJ  U+200D
Define CGJ  U+034F

ByteClass [hiriq] = (105 199 214 230) ;jw 12/10/03

; START OF ACTUAL CONVERSIONS

; control characters
[CTL] <> [CTL]

; preserve standard format markers of backslash followed by string of alphanumeric ASCII characters
;'\' / _ [anum] <> reverse_solidus / _ [anum]
;[anum] / '\' [anum]* _ <> [anum] / reverse_solidus [anum]* _
;compiler rejects quoted strings so change to
92 / _ [anum] <> reverse_solidus / _ [anum]
[anum] / 92 [anum]* _ <> [anum] / reverse_solidus [anum]* _

; Cantillation marks
161 <> U+0591   ; CCAT 92 Hebrew accent ETNAHTA
242 > U+0591    ; ditto DE
253 > U+0591    ; ditto DE
130 <> U+0592   ; CCAT 01 Hebrew accent (SEGOL) SEGOLTA (postpositive)
177 > U+0592    ; ditto DE
142 <> U+0593   ; CCAT 65 Hebrew accent SHALSHELET
082 > U+0593    ; ditto DE
129 <> U+0594   ; CCAT 80 Hebrew accent ZAQEF QATAN
175 > U+0594    ; ditto DE
136 <> U+0595   ; CCAT 85 Hebrew accent ZAQEF GADOL
068 > U+0595    ; ditto DE
155 <> U+0596   ; CCAT 73 Hebrew accent TIPEHA = tarha
089 > U+0596    ; ditto DE
247 > U+0596    ; ditto DE
; Same SIL codes as DEHI, TIPEHA is not word initial
; In word initial environment 155, 089, 247 > DEHI, see below
128 <> U+0597   ; CCAT 81 Hebrew accent REVIA
174 > U+0597    ; ditto DE
141 <> U+0598   ; CCAT 82 Hebrew accent ZARQA = zinorit
081 > U+0598    ; ditto DE
143 <> U+0599   ; CCAT 03,33 Hebrew accent PASHTA (postpositive) (left)
; CCAT 03 is word final; CCAT 33 is not word final but to left of letter ***
; except when followed by holem, when it centers. jw 3/12/03
;[holem] 143 / _ ^039 > U+05B9 U+05A8 ; any holem with 33, not followed by aleph (039) goes from left to medial
;108 [dr_de]?=c [vow_de]?=b 143 / _ [allcons] > U+05DC [dr_de]?=c [vow_de]?=b U+05A8 ; lamed with 33, not word-final 12/10/03 changed order of dagesh, both sides jw
;108 [dr_de]?=c [vow_de]?=b 143 / _ ([allcons] | [hiriq]) > U+05DC [dr_de]?=c [vow_de]?=b U+05A8 ; lamed with 33, not word-final 12/10/03 changed order of dagesh, both sides, corrected context
; of / _ [allcons]. This rule needs to also include all forms of LAIM, without picking up wordfinal 03. jw
; this special code for pashta/azla is not needed because it is now handled in the font, so it is commented out cjs March 2007
; trying to get pashta/azla to match better on return...
; For this pair, I will just convert back to shape, not original encoding. Getting back to the original isn't possible,
; without determining every possible context, and there may not be a general context anyway. If I send them back
; to the correct shape, at least the text should look correct, even though the encoding is not identical. 9/22/03 jw
;Change to 05A8 when sin dot collision forces to medial version 05A8 jw 4/28/03, changed order of dagesh 10/6/03 jw
;087 [dr_de]?=c [vow_de]?=b 143 / _ [allcons] > U+05E9 U+05C2 [dr_de]?=c [vow_de]?=b U+05A8 ;sin w/dot and 33, also not word-final jw
;152 <> U+059A ; CCAT 10 Hebrew accent YETIV (prepositive);uncommented 8/21/03 recom 8/22/03
; Same SIL code as MAHAPAKH, YETIV is word initial
; Converted by special prepositive conversion code below
153 <> U+059B   ; CCAT 91 Hebrew accent TEVIR
095 > U+059B    ; ditto DE
138 <> U+059C   ; CCAT 61 Hebrew accent GERESH
077 > U+059C    ; ditto DE
; Same SIL codes as GERESH MUQDAM, GERESH is not word initial
; In word initial environment 138, 077 > GERESH MUQDAM, see below
;138 <> U+059D ; CCAT 11 Hebrew accent GERESH MUQDAM (prepositive)uncom 8/21/03 recom 8/22/03
;077 > U+059D ; ditto DE uncom 8/21/03
; Same SIL codes as GERESH, GERESH MUQDAM is word initial
; Converted by special prepositive conversion code below
139 <> U+059E   ; CCAT 62 Hebrew accent GERSHAYIM
085 > U+059E    ; ditto DE
133 <> U+059F   ; CCAT 84 Hebrew accent QARNEY PARA
185 > U+059F    ; ditto DE
132 <> U+05A0   ; CCAT 14,44 Hebrew accent TELISHA GEDOLA (prepositive) uncom 8/21/03 recom 8/22/03
;184 > U+05A0 ; ditto DE for consistency uncom 9/18/03 need this for Gen 5:29:5 zeh with medial prepos. jw Okay commented 10/6/03
; CCAT 14 is usually word initial; CCAT 44 is not word initial, occurs Gen 5:29:5,
; Ezra 5:17, Esth 6:13 jw 3/18/03
; *** See also gen 7:7:2nd line
; Converted by special prepositive conversion code below
135 <> U+05A1   ; CCAT 83 Hebrew accent PAZER
188 > U+05A1    ; ditto DE
158 <> U+05A3   ; CCAT 74 Hebrew accent MUNAH
090 > U+05A3    ; ditto DE
249 > U+05A3    ; ditto DE
152 / [OOB] _ > U+059A ;8/22/03 This code determines which YETIB to use. Keep.
152 / # _ > U+059A ;8/22/03 no DE should be coming through at this point.
152 <> U+05A4   ; CCAT 70 Hebrew accent MAHAPAKH medial version
096 > U+05A4    ; ditto DE medial version
; Same SIL codes as YETIV, MAHAPAKH is not word initial
; In word initial environment 152, 096 > YETIV, see above
156 <> U+05A5   ; CCAT 71 Hebrew accent MERKHA = yored
066 > U+05A5    ; ditto DE
245 > U+05A5    ; ditto DE
157 <> U+05A6   ; CCAT 72 Hebrew accent MERKHA KEFULA
084 > U+05A6    ; ditto DE
159 <> U+05A7   ; CCAT 94 Hebrew accent DARGA
094 > U+05A7    ; ditto DE
250 > U+05A7    ; ditto DE
137 <> U+05A8   ; CCAT 63 Hebrew accent QADMA = azla medial
061 > U+05A8    ; ditto DE
; CCAT 04 is word final; CCAT 24 (rare) is not word final but to left of letter ***
131 / _ [OOB] <> U+05A9     ; CCAT 04 Hebrew accent TELISHA QETANA (postpositive)
131 <> U+05A9 ZWNJ          ; CCAT 24 Hebrew accent TELISHA QETANA (non-postpositive)
131 / 79 _ 39 <> U+05A9     ;exception for Est 6.13.11 to allow holem to move on to the aleph
; DE code 182 is mapped forward only, like every other "ditto DE" rule: the Unicode->byte
; direction for U+05A9 and U+05A9 ZWNJ is already declared by the SE (131) rules above.
182 / _ [OOB] > U+05A9      ; ditto DE
182 > U+05A9 ZWNJ           ; ditto DE
154 <> U+05AA   ; CCAT 93 Hebrew accent YERAH BEN YOMO = galgal
106 > U+05AA    ; ditto DE
; new character 05A2 equivalent to 05AA, not used in BHS
154 < U+05A2
134 <> U+05AB   ; CCAT 60 Hebrew accent OLE
186 > U+05AB    ; ditto DE
140 <> U+05AC   ; CCAT 64 Hebrew accent ILUY
080 > U+05AC    ; ditto DE
;155 <> U+05AD ; CCAT 13 Hebrew accent DEHI (prepositive) uncom 8/21/03 recom 8/22/03
; Same SIL code as TIPEHA, DEHI is word initial
; Converted by special prepositive conversion code below
144 <> U+05AE   ; CCAT 02 Hebrew accent ZINOR (postpositive)
086 > U+05AE    ; ditto DE
126 <> U+05AF   ; Hebrew mark MASORA CIRCLE

; 7/30/03
; TEST rearrangement back for mixed high/low marks.
; worked above Exod20:3:2, not below or above for 20:2:3 7/31/03
; these 2 examples are in conflict. One needs to be converted, but not the other.
; Both are LOLOHI in Unicode at present. Note this is Reverse direction.
;[vow_se]=a [high_p_se]=c [low_p_se]=b < [vow_se]=a [low_p_se]=b [high_p_se]=c
;[vow_se]=a [low_p_se]=b [high_p_se]=c < [vow_se]=a [high_p_se]=c [low_p_se]=b

;**********************************************************************************************
;WORKING AREA 8/14/03
; holemvav with preceding cants OK 8/14/03
; divine name OK as is 8/15/03
; legal second vowels: hiriq,sheva,patah-only with LAIM OK
; metegs
; metegs precedence over standard problem OK
; paired cants which change encoding 33-03 OK, not sure about others.
;**********************************************************************************************

;Finals processing and rearrangement - 3 possible environments 8/12/03
;Ezra standard encoding contains no finals.
;Unicode final form always mapped to Ezra standard form, final forms are DE only
;[nonf] / _ [point]* [WSP] > [final]
;[nonf] / _ [point]* # > [final]
;[nonf] / _ [point]* [num]+ > [final] ;jw 3/13/03 OOB now catches finals preceding verse numbers with
;fourth possibility cjs April 2006
;[nonf] / [WSP] _ [point]* [WSP] > [nonf]
; no whitespace. See Exod 20:4:final, Exod 20:9:final.

;This is WORD-FINAL environment. (This should not hit if nonf followed by 226-ZWL, which is what we want for preventing finals processing.)
;no return needed. Finals are not part of SE encoding. 9/11/03
;in decalogue final kaph is followed by two high cants so change to {0,2} cjs April 2006
[nonf]=b / _ [dr_se]{0,2} [vow_se]? [low_p_se]? [high_p_se]{0,2} [low_p_se]? ( [OOB] | # ) <> [final]=b

;to protect the free form of preposition kaph, added by cjs April 2006, and also pethuhah, modified by cjs Feb 2007
[nonf]=b / ( [OOB] | # ) _ [dr_se]{0,2} [vow_se]? [low_p_se]? [high_p_se]{0,2} [low_p_se]? ( [OOB] | # ) <> [nonf]=b

;This is WORD-INITIAL environment.
;is return needed? yes 9/12/03
[pp_se]=a / [OOB] _ <> [pp_se]=a / [OOB] _ ;8/25/03, 9/11/03 jw
[pp_se]=a / # _ <> [pp_se]=a / # _ ;8/25/03, 9/11/03 jw

;NOTE pp_se MUST be used with WhiteSpace/OOB environments. Otherwise, you will have marks identified
;as word-initial which are not. We use the same mark in SE for 2 items in Unicode. Example: 05AD 0596
;This is also very tricky. In the case where you might have a hilo or lohi cant combination, you want to
;catch both. Two lines of code won't work, because they would be the same length and ? matches zero.
;Therefore, both lohi and hilo would activate the original command below. NEXT LINE COMMENTED on purpose.
;[pp_se]=a? [cons]=b [dr_se]=c? [vow_se]=f? [low_p_se]=g? [high_p_se]=h? > [cons]=b [dr_se]=c [vow_se]=f [low_p_se]=g [pp_se]=a [high_p_se]=h
;Basically, in a hilo situation, this command would assume the first lo was optional, run the command,
;and drop out for the final lo mark. To get around this, I've combined the two commands by saying
;optional lohilo. This seems to work. See
;nonf processing above and elsewhere.
;****************************************************************************************************** ; Points and punctuation 252 <> U+05B0 ; CCAT : Hebrew point SHEVA 204 > U+05B0 ; ditto DE 217 > U+05B0 ; ditto DE 232 > U+05B0 ; ditto DE 233 <> U+05B1 ; CCAT :E Hebrew point HATAF SEGOL 206 > U+05B1 ; ditto DE 219 > U+05B1 ; ditto DE 239 > U+05B1 ; ditto DE 225 <> U+05B2 ; CCAT :A Hebrew point HATAF PATAH 207 > U+05B2 ; ditto DE 220 > U+05B2 ; ditto DE 240 > U+05B2 ; ditto DE 243 <> U+05B3 ; CCAT :F Hebrew point HATAF QAMATS 221 > U+05B3 ; ditto DE 236 > U+05B3 ; ditto DE 241 > U+05B3 ; ditto DE 105 <> U+05B4 ; CCAT I Hebrew point HIRIQ 199 > U+05B4 ; ditto DE 214 > U+05B4 ; ditto DE 230 > U+05B4 ; ditto DE 069 <> U+05B5 ; CCAT " Hebrew point TSERE 197 > U+05B5 ; ditto DE 212 > U+05B5 ; ditto DE 227 > U+05B5 ; ditto DE 101 <> U+05B6 ; CCAT E Hebrew point SEGOL 198 > U+05B6 ; ditto DE 213 > U+05B6 ; ditto DE 229 > U+05B6 ; ditto DE 097 <> U+05B7 ; CCAT A Hebrew point PATAH 194 > U+05B7 ; ditto DE 209 > U+05B7 ; ditto DE 222 > U+05B7 ; ditto DE 065 <> U+05B8 ; CCAT F Hebrew point QAMATS 195 > U+05B8 ; ditto DE 210 > U+05B8 ; ditto DE 223 > U+05B8 ; ditto DE ; new character 05C7 equivalent to 05B8, not used in BHS 065 < U+05C7 079 <> U+05B9 ; CCAT O Hebrew point HOLEM 200 > U+05B9 ; ditto DE 215 > U+05B9 ; ditto DE 117 <> U+05BB ; CCAT U Hebrew point QUBUTS 201 > U+05BB ; ditto DE 216 > U+05BB ; ditto DE 231 > U+05BB ; ditto DE 208 <> U+05BC ; CCAT . 
Hebrew point DAGESH or MAPIQ = shuruq 067 > U+05BC ; ditto DE 070 > U+05BC ; ditto DE 071 > U+05BC ; ditto DE 073 > U+05BC ; ditto DE 074 > U+05BC ; ditto DE 075 > U+05BC ; ditto DE 076 > U+05BC ; ditto DE 183 > U+05BC ; ditto DE 149 <> U+05BD ; CCAT 35 (with hataf vowels) ; CCAT 75 (default) ; CCAT 95 (word-initial) ;jw modified note from "prepositive" ; Hebrew point METEG = siluq 189 > U+05BD ; ditto DE 190 > U+05BD ; ditto DE 191 > U+05BD ; ditto DE 192 > U+05BD ; ditto DE 193 > U+05BD ; ditto DE ; special processing BACK of word-initial meteg, similar to that of prepositive accents above ;jw modified meteg notes. Meteg is never word-initial and does not even precede holem, (exc. in 11 occs of 95-O) ; MC encoding for regular meteg is 75. ; MC encoding for right meteg is 95. Right meteg is accomplished by placing right meteg before the low vowel. Left meteg on ; hatafs must be CGJ followed by regular meteg. 7/24/03 jw ;Left meteg was not encodable after hatafs in SE. It is placed on left ;but will be converted to central (regular) meteg if converted to DE jw 8/5/03 149 < CGJ U+05BD ; jw 8/12/03 LEFT METEG NEW ORDER 149 < U+F303 ; jw 8/5/03 old PUA LEFT METEG 149 < U+F302 ; jw 10/6/03 old PUA RIGHT METEG 045 <> U+05BE ; CCAT - Hebrew punctuation MAQAF 038 <> U+05BF ; CCAT , Hebrew point RAFE 124 <> U+00A0 U+05C0 ; CCAT 05 Hebrew punctuation PASEQ = legarmeh jw added NBSP for better spacing 3/31/03 ; <> U+05C1 ; Hebrew point SHIN DOT ; <> U+05C2 ; Hebrew point SIN DOT 058 <> U+05C3 ; CCAT 00 Hebrew punctuation SOF PASUQ 178 <> U+05C4 ; CCAT 52 Hebrew mark UPPER DOT 05C4(changed to 0307 8/15/03, changed back 9/9/03)jw 178 < U+0307 ; CCAT 52 Hebrew mark UPPER DOT 05C4(changed to 0307 8/15/03, changed back 9/9/03)jw ; Based on ISO 8859-8 039 <> U+05D0 ; CCAT ) Hebrew letter ALEF = aleph 098 <> U+05D1 ; CCAT B Hebrew letter BET 103 <> U+05D2 ; CCAT G Hebrew letter GIMEL 100 <> U+05D3 ; CCAT D Hebrew letter DALET 104 <> U+05D4 ; CCAT H Hebrew letter HE 119 <> U+05D5 ; 
CCAT W Hebrew letter VAV 122 <> U+05D6 ; CCAT Z Hebrew letter ZAYIN 120 <> U+05D7 ; CCAT X Hebrew letter HET 088 <> U+05D8 ; CCAT + Hebrew letter TET 121 <> U+05D9 ; CCAT Y Hebrew letter YOD 162 > U+05DA ; CCAT K Hebrew letter FINAL KAF 107 < U+05DA ; CCAT K Hebrew letter FINAL KAF 7/23/03 jw ; see code below for processing of final forms 107 <> U+05DB ; CCAT K Hebrew letter KAF 108 <> U+05DC ; CCAT L Hebrew letter LAMED 163 > U+05DD ; CCAT M Hebrew letter FINAL MEM 109 < U+05DD ; CCAT M Hebrew letter FINAL MEM 7/23/03 jw ; see code below for processing of final forms 109 <> U+05DE ; CCAT M Hebrew letter MEM 164 > U+05DF ; CCAT N Hebrew letter FINAL NUN 110 < U+05DF ; CCAT N Hebrew letter FINAL NUN 7/23/03 jw ; see code below for processing of final forms 110 <> U+05E0 ; CCAT N Hebrew letter NUN ;115 <> U+05E1 ; CCAT S Hebrew letter SAMEKH moved 7/23/03 jw 118 <> U+05E2 ; CCAT ( Hebrew letter AYIN 165 > U+05E3 ; CCAT P Hebrew letter FINAL PE 112 < U+05E3 ; CCAT P Hebrew letter FINAL PE 7/23/03 jw ; see code below for processing of final forms ;112 <> U+05E4 ; CCAT P Hebrew letter PE moved 7/23/03 jw 166 > U+05E5 ; CCAT C Hebrew letter FINAL TSADI 099 < U+05E5 ; CCAT C Hebrew letter FINAL TSADI 7/23/03 jw ; see code below for processing of final forms 099 <> U+05E6 ; CCAT C Hebrew letter TSADI = zade 113 <> U+05E7 ; CCAT Q Hebrew letter QOF 114 <> U+05E8 ; CCAT R Hebrew letter RESH 083 <> U+05E9 ; CCAT # Hebrew letter SHIN (unpointed) 072 <> U+05E9 U+05C1; Hebrew letter SHIN with SHIN DOT jw 087 <> U+05E9 U+05C2; Hebrew letter SHIN with SIN DOT jw 116 <> U+05EA ; CCAT T Hebrew letter TAV ; Yiddish digraphs 119 119 < U+05F0 ; CCAT WW Hebrew ligature YIDDISH DOUBLE VAV = tsvey vovn ; (not used in Hebrew) 119 121 < U+05F1 ; CCAT WY Hebrew ligature YIDDISH VAV YOD ; (not used in Hebrew) 121 121 < U+05F2 ; CCAT YY Hebrew ligature YIDDISH DOUBLE YOD = tsvey yudn ; (not used in Hebrew) ; Additional punctuation 035 <> U+05F3 ; Hebrew punctuation GERESH 034 <> U+05F4 ; 
Hebrew punctuation GERSHAYIM ; General punctuation: Bidirectional control characters 226 < U+200C ; ZERO WIDTH NON-JOINER 226 < U+200D ; ZERO WIDTH JOINER ; 226 > U+200D is used to disable switch to final forms, so not word breaking ; 226 ZWL ZeroWidthLetter prevents finals processing on consonants KMNCP 226 > ;Delete ZERO WIDTH LETTER, no longer needed in Unicode data ;since finals are separate encoding. Any final followed by 226 should go to normal ;nonfinal form. This would normally only occur in isolation. 8/18/03 jw ; Hebrew presentation forms 121 105 < U+FB1D ; CCAT YI Hebrew letter YOD with HIRIQ =05D9+05B4 ; <> U+FB1E ; Hebrew point JUDEO-SPANISH VARIKA ~0306 121 121 097 < U+FB1F ; CCAT YYA Hebrew ligature YIDDISH YOD YOD PATAH =05F2+05B7 118 < U+FB20 ; CCAT ( Hebrew letter ALTERNATIVE AYIN ~05E2 039 < U+FB21 ; CCAT ) Hebrew letter WIDE ALEF ~05D0 100 < U+FB22 ; CCAT D Hebrew letter WIDE DALET ~05D3 104 < U+FB23 ; CCAT H Hebrew letter WIDE HE ~05D4 107 < U+FB24 ; CCAT K Hebrew letter WIDE KAF ~05DB 108 < U+FB25 ; CCAT L Hebrew letter WIDE LAMED ~05DC 109 < U+FB26 ; CCAT M Hebrew letter WIDE FINAL MEM ~05DD 114 < U+FB27 ; CCAT R Hebrew letter WIDE RESH ~05E8 116 < U+FB28 ; CCAT T Hebrew letter WIDE TAV ~05EA ; <> U+FB29 ; Hebrew letter ALTERNATIVE PLUS SIGN ~002B 072 < U+FB2A ; CCAT $ Hebrew letter SHIN with SHIN DOT =05E9+05C1 087 < U+FB2B ; CCAT & Hebrew letter SHIN with SIN DOT =05E9+05C2 072 208 < U+FB2C ; CCAT $. Hebrew letter SHIN with DAGESH AND SHIN DOT=FB49+05C1 087 208 < U+FB2D ; CCAT &. Hebrew letter SHIN with DAGESH AND SIN DOT=FB49+05C2 039 097 < U+FB2E ; CCAT )A Hebrew letter ALEF with PATAH =05D0+05B7 039 065 < U+FB2F ; CCAT )F Hebrew letter ALEF with QAMATS =05D0+05B8 039 208 < U+FB30 ; CCAT ). Hebrew letter ALEF with MAPIQ =05D0+05BC 098 208 < U+FB31 ; CCAT B. Hebrew letter BET with DAGESH =05D1+05BC 103 208 < U+FB32 ; CCAT G. Hebrew letter GIMEL with DAGESH =05D2+05BC 100 208 < U+FB33 ; CCAT D. 
; Hebrew letter DALET with DAGESH =05D3+05BC
104 208 < U+FB34 ; CCAT H. Hebrew letter HE with MAPIQ =05D4+05BC
251 < U+FB35 ; CCAT W. Hebrew letter VAV with DAGESH =05D5+05BC

; NOTE(review): [any_p_se], [vow_se], [dr_se] and [allcons] are not defined in this part of
; the file, and [cons] / [shindots] are only seen defined in a later pass; presumably all of
; them are declared earlier in this pass - confirm.
;SHUREQ processing
119 208 [any_p_se]=a 251 < U+05D5 U+05BC [any_p_se]=a U+05D5 U+05BC ;9/26/03 try this. keep.
;jk SEQUENCE HERE! works
UniClass[any_p_or_vow] = ( [any_p_se] [vow_se] )
251 <> U+05D5 U+05BC / _ ( [any_p_se] ^[vow_se] | ^[any_p_or_vow] )
119 208 251 < U+05D5 U+05BC U+05D5 U+05BC/ [vow_se] _

;HOLAM/WAW and WAW/HOLAM processing
;change vocalic holam waw to waw-holam order and move cants
079 [any_p_se]?=b [any_p_se]?=c 119 / [allcons] [dr_se]? _ > [any_p_se]?=b [any_p_se]?=c U+05D5 U+05B9
;this isn't called after lamed if cant = U+05A8, because of changes above
;corrected in a later pass below
;implementation of 05BA = holam haser for consonantal WO
119 [dr_se]? [any_p_se]{0,2} 079 <> U+05D5 [dr_se]? [any_p_se]{0,2} U+05BA
;where there is consonantal waw followed by plene holam
119=a [dr_se]?=b [any_p_se]?=f 079=c [any_p_se]?=d 119=e <> U+05D5=a [dr_se]?=b [any_p_se]?=d [any_p_se]?=f U+05D5=e U+05B9=c
119=a [dr_se]?=b [any_p_se]?=d 244 > U+05D5=a [dr_se]?=b [any_p_se]?=d U+05D5 U+05B9
;where holam waw is followed by a vowel, waw is consonantal and the holam belongs to the preceding consonant
079 [any_p_se]?=b [any_p_se]?=c 119 [vow_se]=d <> [any_p_se]?=b [any_p_se]?=c U+05B9 U+05D5 [vow_se]=d

;RETURN TRIP
;needs a return trip for the w/o order in old text
[any_p_se]?=a [any_p_se]?=b 244 < U+05B9 [any_p_se]?=a [any_p_se]?=b U+05D5 ;old vocalic OW
;but also need to distinguish return trip from old and new text with U+05D5 U+05B9 order
244 < U+05D5 U+05B9 / [cons] [shindots]? [dr_se]? [any_p_se]{0,2} _ ;new vocalic OW
119 [dr_se]? 079 < U+05D5 [dr_se]? U+05B9 / [vow_se] [any_p_se]{0,2} _ ;old consonantal WO
;special case (Dan 5.6.3, 5.9.6) where the preceding vowel is plene hireq
119 [dr_se]? 079 < U+05D5 [dr_se]? U+05B9 / U+05D6 U+05B4 [any_p_se]{0,2} U+05D9 [any_p_se]{0,2} _ ;old consonantal WO
;special case (Jer 52.19.7) where the yodh is consonantal and the waw vocalic
244 < U+05D5 U+05B9 / U+05E7 U+05B4 U+05D9 U+0594 _ ;new vocalic OW

122 208 < U+FB36 ; CCAT Z. Hebrew letter ZAYIN with DAGESH =05D6+05BC
088 208 < U+FB38 ; CCAT +. Hebrew letter TET with DAGESH =05D8+05BC
121 208 < U+FB39 ; CCAT Y. Hebrew letter YOD with DAGESH =05D9+05BC
107 208 < U+FB3A ; CCAT K. Hebrew letter FINAL KAF with DAGESH =05DA+05BC
172 > U+05DA U+05BC; ditto DE
107 208 < U+FB3B ; CCAT K. Hebrew letter KAF with DAGESH =05DB+05BC
108 208 < U+FB3C ; CCAT L. Hebrew letter LAMED with DAGESH =05DC+05BC
109 208 < U+FB3E ; CCAT M. Hebrew letter MEM with DAGESH =05DE+05BC
110 208 < U+FB40 ; CCAT N. Hebrew letter NUN with DAGESH =05E0+05BC
115 208 < U+FB41 ; CCAT S. Hebrew letter SAMEKH with DAGESH =05E1+05BC
112 208 < U+FB43 ; CCAT P. Hebrew letter FINAL PE with DAGESH =05E3+05BC
169 > U+05E3 U+05BC; ditto DE
112 208 < U+FB44 ; CCAT P. Hebrew letter PE with DAGESH =05E4+05BC
099 208 < U+FB46 ; CCAT C. Hebrew letter TSADI with DAGESH =05E6+05BC
113 208 < U+FB47 ; CCAT Q. Hebrew letter QOF with DAGESH =05E7+05BC
114 208 < U+FB48 ; CCAT R. Hebrew letter RESH with DAGESH =05E8+05BC
083 208 < U+FB49 ; CCAT #. Hebrew letter SHIN with DAGESH =05E9+05BC
116 208 < U+FB4A ; CCAT T.
; Hebrew letter TAV with DAGESH =05EA+05BC
119 079 < U+FB4B ; CCAT WO Hebrew letter VAV with holem =05D5+05B9
098 038 < U+FB4C ; CCAT B, Hebrew letter BET with RAFE =05D1+05BF
107 038 < U+FB4D ; CCAT K, Hebrew letter KAF with RAFE =05DB+05BF
112 038 < U+FB4E ; CCAT P, Hebrew letter PE with RAFE =05E4+05BF
039 108 < U+FB4F ; CCAT )L Hebrew ligature ALEF LAMED =05D0+05DC
; (not used in Biblical Hebrew)

; SIL Ezra characters not represented in Unicode
;036 <> U+0306 ; (CCAT 52) Hebrew mark NUMBER (incorrectly used for 05C4)
; *** temporary replacement for the above for processing miscoded BHS text ***
; *** note that Ezra 036 is never used for NUMBER in actual Biblical Hebrew
036 <> U+0307 ; Hebrew mark NUMBER-single dot
037 <> U+0308 ; Hebrew mark NUMBER-double dot NEW ORDER
043 > U+002A ; Combining ASTERISK ABOVE - change to reg. asterisk 8/15/03 jw

; note that setuma & petuha codes should appear only in running text with actual whitespace on at least one side
; Byte 060 becomes U+05E1 and byte 062 becomes U+05E4 only in the contexts spelled out below;
; the same context is repeated on the Unicode side of each rule for the return trip.
060 / [WS] _ [WS] <> U+05E1 / [WS] _ [WS] ;original code left out setuma/petuha with no whitespace after (Was 00A7)
060 / [WS] _ [num] <> U+05E1 / [WS] _ [num]; CCAT S Hebrew punctuation SETUMA
060 / [WS] _ # <> U+05E1 / [WS] _ # ; CCAT S Hebrew punctuation SETUMA
060 / # _ [WS] <> U+05E1 / # _ [WS] ; CCAT S Hebrew punctuation SETUMA
062 / [WS] _ [WS] <> U+05E4 / [WS] _ [WS] ;original code left out setuma/petuha with no whitespace after.
; (Was 00B6)jw
062 / [WS] _ [num] <> U+05E4 / [WS] _ [num]; CCAT P Hebrew punctuation PETUHA
062 / [WS] _ # <> U+05E4 / [WS] _ # ; CCAT P Hebrew punctuation PETUHA
062 / # _ [WS] <> U+05E4 / # _ [WS] ; CCAT P Hebrew punctuation PETUHA
115 <> U+05E1 ; CCAT S Hebrew letter SAMEKH
112 <> U+05E4 ; CCAT P Hebrew letter PE
064 <> U+25CC ; PLACEHOLDER
078 < U+05E0 U+034F U+0307 ; CCAT N]8 Hebrew letter INVERTED NUN jw changed to PUA F300, changed to seq 8/15/03
; changed back to 0307 9/9/03 jw
; changed to (Unicode 4.1)
078 <> U+05C6 U+0307 ; U+05C6 added in Unicode 4.1 cjs April 2006 U+034F deleted cjs March 2007
102 <> U+05B1 ZWJ U+05BD; CCAT :E35 Hebrew point HATAF SEGOL with METEG - cjs added ZWJ
102 < U+05B1 U+05BD; CCAT :E35 Hebrew point HATAF SEGOL with METEG
111 > U+05B8 ; CCAT F Hebrew point QAMATS HATUF (kamats-o)
196 > U+05B8 ; ditto DE
211 > U+05B8 ; ditto DE
224 > U+05B8 ; ditto DE
167 > U+05DA U+05B8; CCAT KF Hebrew letter FINAL KAF with QAMATS
168 > U+05DF U+05B8; CCAT NF Hebrew letter FINAL NUN with QAMATS
170 > U+05DA U+05B0; CCAT K: Hebrew letter FINAL KAF with SHEVA
;173 > U+FB3A U+05B8; CCAT K.F Hebrew letter FINAL KAF with DAGESH and QAMATS
173 > U+05DA U+05BC U+05B8; CCAT K.F Hebrew letter FINAL KAF with DAGESH and QAMATS jw changed from FB3A 05B8
; LOWER DOT: U+05C5 is the round-trip target; U+0323 and U+F301 are still accepted on the return trip.
179 <> U+05C5 ; CCAT 53 Hebrew mark LOWER DOT ;cjs Unicode 4.1 added U+05C5
179 < U+0323 ; CCAT 53 Hebrew mark LOWER DOT ;jw 7/24/03 don't use F301 any longer
179 < U+F301 ; CCAT 53 Hebrew mark LOWER DOT ;jw changed to PUA
180 <> U+05AB ; CCAT ^ Hebrew ACCENT
181 > U+05AB ; ditto DE
202 > U+05B5 U+05D9; CCAT "Y Hebrew letter TSERE with YOD
203 > U+05B5 U+05D4; CCAT "H Hebrew letter TSERE with HE
228 > U+05B8 U+05D4; CCAT FH Hebrew letter QAMATS with HE
234 > U+05B6 U+05D9; CCAT EY Hebrew letter SEGOL with YOD
235 > U+05B6 U+05D4; CCAT EH Hebrew letter SEGOL with HE
238 > U+05B4 U+05D9; CCAT IY Hebrew letter HIRIQ with YOD
244 <> U+05B9 U+05D5; CCAT OW Hebrew letter holem VAV (holem TO RIGHT); return
; conversion with
;intervening cants is handled in ASCII pass. 9/12/03 jw
246 > U+05B9 U+05D4; CCAT OH Hebrew letter holem with HE
248 > U+05B0 ; CCAT : Hebrew point SILENT SHEVA
205 > U+05B0 ; ditto DE
218 > U+05B0 ; ditto DE
237 > U+05B0 ; ditto DE
254 <> U+05B2 ZWJ U+05BD; CCAT :A35 Hebrew point HATAF PATAH with METEG - cjs added ZWJ
254 < U+05B2 U+05BD; CCAT :A35 Hebrew point HATAF PATAH with METEG
255 > ; THIN SPACE - omitted as used only for DE spacing

; miscellaneous characters in SIL Hebrew, not converted
; checked that all characters 0-255 have been included PK 1/12/2000
032 <> space
033 <> exclamation_mark
040 <> U+0028 ;jw was right_parenthesis
041 <> U+0029 ;jw was left_parenthesis
042 <> asterisk
044 <> comma
046 <> full_stop
047 <> solidus
048 <> digit_zero
049 <> digit_one
050 <> digit_two
051 <> digit_three
052 <> digit_four
053 <> digit_five
054 <> digit_six
055 <> digit_seven
056 <> digit_eight
057 <> digit_nine
059 <> semicolon
063 <> question_mark
091 <> U+005B ; jw from right_square_bracket Note: word-initial paired items
092 <> reverse_solidus ;(backslash) ;will not display in Ezra SIL in Word 2002
093 <> U+005D ; jw from left_square_bracket
123 <> U+007B ; jw from right_curly_bracket
125 <> U+007D; jw from left_curly_bracket
145 <> U+2019 ; RIGHT SINGLE QUOTATION MARK
146 <> U+2018 ; LEFT SINGLE QUOTATION MARK
147 <> U+201E ; DOUBLE LOW-9 QUOTATION MARK
148 <> U+201C ; jw from 0201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
150 <> U+2013 ; EN DASH
151 <> U+2014 ; EM DASH
160 <> U+00A0 ; NO-BREAK SPACE ;jw this item is Reserved in font
171 <> U+00AB ; jw was 00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
; NOTE(review): 160 (above) and 176 (next rule) both use '<>' with U+00A0, so the
; Unicode-to-byte direction for U+00A0 is declared twice - confirm which byte the compiler emits.
176 <> U+00A0 ; NO-BREAK SPACE
187 <> U+00BB ; jw was 00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK

;special case at Jdg 19.13.3 - seems to have the wrong vowel-cant order in WLC
108 252 107 156 65 > U+05DC U+05B0 U+05DA U+05B8 U+05A5
;special case at 2Sa 3.8.11 added cjs Oct 2007
;BHS shows the cant to the left of the holem, so use U+05AE and
; insert ZWNJ - extend the context if necessary
110 79 141 107 > U+05E0 U+05B9 U+05AE ZWNJ U+05DB
;special case at 2Sa 3.8.11 added cjs Oct 2007
;BHS shows the cant to the right, so insert ZWNJ
;also move the word-initial telisha magnum (MC 14) at this stage to avoid a match in the [pp] rearrangement
;note that shureq is already decomposed 251 > 119 208
132 119 208 109 105 132 <> U+05D5 U+05BC U+05A0 ZWNJ U+05DE U+05B4 U+05A0

;This pass does final rearrangement to NEW ORDER. Also lots of documentation here.
pass (Unicode)
; **********************************************************************************
; TIPS
; **********************************************************************************
; / follow this with pre-context _ post-context
; | match either preceding or following context
; ? match preceding item 0 or 1 times (optional single)
; * match preceding item 0 to 15 times (optional multiple, deleting all but first)
; + match preceding item 1 to 15 times (one occurrence required)
; {0,2} means 0-2 times
; pp=pre-positive cant
; p = point cant (no vowel)
; post=post-positive cant
; dr=dagesh/rafe
; vow = vowel
Class [pp] = ( U+059D U+05A0 U+059A U+05AD )
Class [high_p] = ( U+0593 U+0594 U+0595 U+0597 U+0598 U+0599 U+059C U+059E U+059F \
 U+05A1 U+05A8 U+05A9 U+05AB U+05AC U+05AE U+05C4 U+0307 U+0308 ) ;added last 3 9/9/03 jw, added the [hipost] cjs May 2006
Class [low_p] = ( U+0591 U+0596 U+059B U+05A3 U+05A4 U+05A5 U+05A6 U+05A7 \
 U+05AA U+05BD U+05C5 U+05A2 ) ;cjs added U+05C5 and U+05A2
; NOTE(review): [hipost] is not referenced by any rule visible in this pass - confirm whether it is still needed.
Class [hipost] = ( U+0592 U+0599 U+05A9 U+05AE ) ;cjs added U+0592
Class [cons] = ( U+05D0 U+05D1 U+05D2 U+05D3 U+05D4 U+05D5 U+05D6 U+05D7 \
 U+05D8 U+05D9 U+05DB U+05DC U+05DE U+05E0 U+05E1 U+05E2 \
 U+05E4 U+05E6 U+05E7 U+05E8 U+05E9 U+05EA )
Class [dr] = ( U+05BC U+05BF )
Class [vow] = ( U+05B0 .. U+05BB U+05C7 )
Class [lo_vow] = ( U+05B0 .. U+05B8 U+05BB U+05C7 )
Class [shindots] = ( U+05C1 U+05C2 )

;The purpose of this rearrangement is to meet Unicode requirements of word-initial marks following the base
;consonant and vowel. The second purpose is to place the data into the NEW ORDER, decided by the font designer
;group of Microsoft, Antioch, Tiro Typeworks, SBL, and SIL in May 2003.
;The SIL Ezra (old) order is:
;[pre-positive] [consonant] [shin/sin dots][dagesh][rafe] [right meteg] [vowel] [cants] ([2nd vowel,cants])[low post-positive]
;The NEW ORDER is:
;[consonant] [shin/sin dot] [dagesh][rafe] [holem] [right meteg] [low vowels] [low cantillations] (low 2nd vowel)
;[low pre-positive] [high pre-positive] [hi cant or dots] [low post-positive] (ZWNJ+any mark)
;Things to note about this order:
;1) No cantillation mark ever comes before a vowel, except the very exceptional right meteg. We hope to have a Unicode
;character assigned for it in the future. It will probably then be moved to the low cantillations group.
;2) All low marks are contiguous. All high marks (exc. holem) are contiguous, (since there are no low post-positives.)
;3) Normal order then is consonant-vowel-low cants-high cants. This is generally the same order as MC and SIL Ezra,
;although it was not unusual to have high cants then low cants or other mixed groupings.
;4) The NEW ORDER is not canonical order or NFC/NFD order. Data which is in these orders will likely not display
;properly with any of the group's fonts.
;5) Any mark which must occur out of order in order to get proper rendering, such as left meteg on hataf,
;etc. should occur with ZWNJ, ZWJ, or CGJ and be last, where possible. See below:
; NOTE(review): the sequence names that start the next four entries were missing from this copy;
; they are restored from the ZWJ / ZWNJ / CGJ meteg rules in the last pass of this file - confirm against the original.
; hataf + meteg = variable rendering depending on font (medial meteg in
;SBL Hebrew, Vusillus, etc; left meteg in some other fonts)
; hataf + ZWJ + meteg = always medial ligated form
; hataf + ZWNJ + meteg = always left meteg (post hataf)
; meteg + CGJ + hataf = always right meteg (pre hataf)
;This does mean, of course, that if you can't reliably determine what font
;will be used to display the text, e.g.
; in web publishing, you are going to
;want to use ZWJ and ZWNJ in every case and not rely on the default
; NOTE(review): the object of the next sentence was missing in this copy; "hataf + meteg" is
; inferred from the ZWJ/ZWNJ discussion here and the meteg rules in the last pass - confirm.
;rendering of particular fonts for hataf + meteg.
;Another way to look at it:
;ZWJ - for circumstances in which actual ligation occurs at the glyph level.
;ZWNJ - for prevention of ligation.
;CGJ - for controlling mark ordering.
;These characters are available on the Ezra keyboards, but may or may not be accessible in Word.
;They can be added with Unicode macros, if necessary.

;**************** START OF REARRANGEMENT ****************************************
; OUT OF BOUNDS - whitespace or word-breaking, incl. numbers, punctuation, latin punctuation, dashes, spaces, no thinspace, quotes. It is likely that I have left something out of this list. 8/21/03 jw
; NOTE(review): U+0020 is listed on its own and again inside U+0020 .. U+0040, and U+05BE U+05C0 U+05C3
; appear twice; presumably harmless because [OOB] is only used as a context in the rules below - confirm.
UniClass [OOB] = ( U+0009 U+000A U+000D U+0020 U+00A0 U+0020 .. U+0040 U+05F3 U+05F4 \
 U+05BE U+05C0 U+05C3 U+005B .. U+0060 \
 U+007B .. U+007E U+00A7 U+00AB U+00AF \
 U+00B6 U+00BB U+00BF U+05BE U+05C0 U+05C3 U+2000 .. U+200A U+2010 .. U+2021 \
 U+2039 U+203A );deleted 25CC. JK says ignore 25CCs inserted by Uniscribe. 8/25/03
; added Hebrew punctuation 8/25/03
;A dotted circle followed by a pre-positive accent is certainly legal in itself. So I don't want the
;rearrangement to run on a typed dotted circle. It is removed from list above.
;NOTE pp_se MUST be used with WhiteSpace/OOB environments. Otherwise, you will have marks identified
;as word-initial which are not.
; Added shindots to all logic 10/6/03 jw
; added ZWJ (U+200D) cjs April 2006
; added CGJ (U+034F) and ZWNJ (U+200C) cjs Feb 2007
; NOTE(review): the tag on [shindots] is written before the '?' ([shindots]=j?) while other optional
; items in the same rules put it after (U+05BD?=l) - confirm the compiler treats both forms alike.
; Word-initial pre-positive: the same rule is given twice, once after an [OOB] character and once at '#'.
[pp]=a [cons]=b [shindots]=j? [dr]{0,2}=c U+05BD?=l U+034F?=m [vow]{0,2}=d (U+200C | U+200D)?=k [high_p]{0,2}=i [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f [low_p]{0,2}=g / [OOB] _ > @b @j @c @l @m @d @k @h @e @g @a @i @f
[pp]=a [cons]=b [shindots]=j? [dr]{0,2}=c U+05BD?=l U+034F?=m [vow]{0,2}=d (U+200C | U+200D)?=k [high_p]{0,2}=i [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f [low_p]{0,2}=g / # _ > @b @j @c @l @m @d @k @h @e @g @a @i @f
;normal rearrangement - no pre-positive, no second vowel
[cons]=b [shindots]=j? [dr]{0,2}=c [vow]{0,2}=d [high_p]{0,2}=i [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f [low_p]{0,2}=g > @b @j @c @d @h @e @g @i @f
;rare 2nd vowel rearrangement, doesn't add CGJ-034F before 2nd vowel because that interferes with positioning in Word
;although is helpful for preventing canonical reordering.
;this is now recommended in Unicode 4.1, so it was added here cjs April 2006
[cons]=b [shindots]=k? [dr]{0,2}=c [vow]{1,2}=d [high_p]{0,2}=i [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f [low_p]{0,2}=g [vow]=j > @b @k @c [vow]{1,2}=d @h @e @g U+034F @j @i @f
;NOTEPAD is more reliable. WORD 2002 is not reliably displaying sequences of Latin marks RTL. We are avoiding this problem
;by not encoding 2 Latin marks in a row.

;**************** RETURN TRIP HERE ****************************************
;2nd vowel
; @b @k @c @d @e @h @f @j < [cons]=b [shindots]=k? [dr]{0,2}=c [vow]{1,2}=d [low_p]{0,2}=e U+05C5?=h U+034F [vow]=j [high_p]{0,2}=f
; changed to fix doubling of hireq in Jerusalem on return trip cjs May 2006
; return trip is also needed for back conversion of older text lacking CGJ - note the ? with U+034F
@b @k @c [vow]{1,2}=d @e @h @f @j < [cons]=b [shindots]=k? [dr]{0,2}=c [vow]{1,2}=d [low_p]{0,2}=e U+05C5?=h U+034F? [vow]=j [high_p]{0,2}=f
;regular consonant stays same
@b @j @c @d @e @h @f < [cons]=b [shindots]=j? [dr]{0,2}=c [vow]{0,2}=d [low_p]{0,2}=e U+05C5?=h [high_p]{0,2}=f
; prepositives
;cjs added ZWJ = 200D May 2006 and CGJ (U+034F) and ZWNJ (U+200C) cjs Feb 2007
@a @b @j @c @l @m @d @k @e @h @f < [cons]=b [shindots]=j? [dr]{0,2}=c U+05BD?=l U+034F?=m [vow]{0,2}=d (U+200C | U+200D)?=k [low_p]{0,2}=e U+05C5?=h [pp]=a [high_p]{0,2}=f / # _
@a @b @j @c @l @m @d @k @e @h @f < [cons]=b [shindots]=j? [dr]{0,2}=c U+05BD?=l U+034F?=m [vow]{0,2}=d (U+200C | U+200D)?=k [low_p]{0,2}=e U+05C5?=h [pp]=a [high_p]{0,2}=f / [OOB] _
;postpositives are untouched because they are always last in both encodings.

; ***********
; extra pass added to insert CGJ, ZWJ, ZWNJ into vowel-meteg or similar sequences
; also corrects miscellaneous problems
pass (Unicode)
Class [WS] = ( U+0009 U+000A U+000D U+0020 U+00A0 )
Class [lo_vow] = ( U+05B0 U+05B4 U+05B5 U+05B6 U+05B7 U+05B8 U+05BB U+05C7 )
Class [hataf] = ( U+05B1 .. U+05B3 )
Class [lo_cant] = ( U+0591 U+0596 U+059A U+059B U+05A2 U+05A3 U+05A4 U+05A5 U+05A6 U+05A7 \
 U+05AA U+05AD U+05C5 )
Class [hi_cant] = ( U+0592 U+0593 U+0594 U+0595 U+0597 U+0598 U+0599 U+059C U+059D U+059E U+059F \
 U+05A0 U+05A1 U+05A8 U+05A9 U+05AB U+05AC U+05AE U+05AF U+05C4 )
;this is the meteg stuff
;[lo_vow] U+05BD - canonical order so unchanged U+05BD = meteg
U+05BD [hi_cant]?=b [lo_vow]=a <> U+05BD U+034F [lo_vow]=a [hi_cant]?=b ;CGJ
U+05BD [lo_cant]?=b [lo_vow]=a <> U+05BD [lo_cant]?=b U+034F [lo_vow]=a ;CGJ
;medial meteg with hataf needs ZWJ - dealt with above - see 102 and 254
;however the ZWJ has to be removed for the back conversion to work properly with pre-pos cants
;even though this loses the left/medial distinction in the return trip
[hataf] U+05BD < [hataf] U+200D U+05BD ;ZWJ medial
[hataf] U+05BD <> [hataf] U+200C U+05BD ;ZWNJ left
U+05BD [hataf]=a <> U+05BD U+034F [hataf]=a ;CGJ right
;when metheg (silluq) is to the left of another low cant, it needs a CGJ to hold it on normalisation
U+05BD / [lo_cant] _ <> U+034F U+05BD

; miscellaneous problems
; 1. holem-waw with lamed and azla/qadma cant
; the next line is not needed on the return trip and would give a mismatch at Dan 6.5.4
U+05B9 U+05A8 U+05D5 / U+05DC U+05BC? _ > U+05A8 U+05D5 U+05B9
; 2. holam as a second vowel is rare (only 2Ki 21.26.1) but must occur first and needs the CGJ to prevent reordering
[lo_vow]=a U+034F U+05B9=e > U+05B9=e U+034F [lo_vow]=a
[lo_vow]=a U+05B9=e < U+05B9=e U+034F [lo_vow]=a
; 3. special correction for the unique Job 6.10.1 with no reversal
U+05A5 U+05D5 U+05BC / ( [WS] | # ) _ > U+05D5 U+05BC U+05A5