merge: allow more non-ascii in markup (#4)

2024-11-21 21:55:09 +00:00 · 2023-12-28 12:54:56 +01:00 · 2023-12-28 12:54:56 +01:00 · 3ab3ca1cfb
commit 3ab3ca1cfb
parent 3f5388227f 8818a2df9a
2 changed files with 48 additions and 3 deletions
--- a/src/internal/parser.ts
+++ b/src/internal/parser.ts
@@ -12,8 +12,8 @@ import twemojiRegex from '@twemoji/parser/dist/lib/regex';
 type ArgPair = { k: string, v: string | true };
 type Args = Record<string, string | true>;

-const space = P.regexp(/[\u0020\u3000\t]/);
-const alphaAndNum = P.regexp(/[a-z0-9]/i);
+const space = P.regexp(/[\s--[\n\r]]/v);
+const alphaAndNum = P.regexp(/\p{Letter}|\p{Number}/iu);
 const newLine = P.alt([P.crlf, P.cr, P.lf]);

 function seqOrText(parsers: P.Parser<any>[]): P.Parser<any[] | string> {
@@ -579,7 +579,7 @@ export const language = P.createLanguage({
 	hashtag: r => {
 		const mark = P.str('#');
 		const hashTagChar = P.seq([
-			P.notMatch(P.alt([P.regexp(/[ \u3000\t., \u2063\t.,!?'"#:/[\]【】()「」（）<>]/), space, newLine])),
+			P.notMatch(P.regexp(/[\s.,\u2063!?'"#:/[\]【】()「」（）<>]/u)),
 			P.char,
 		], 1);
 		const innerItem: P.Parser<any> = P.lazy(() => P.alt([
--- a/test/parser.ts
+++ b/test/parser.ts
@@ -565,6 +565,16 @@ hoge`;
 			assert.deepStrictEqual(mfm.parse(input), output);
 		});

+		test('basic non-ascii', () => {
+			const input = '*aßc*';
+			const output = [
+				ITALIC([
+					TEXT('aßc')
+				])
+			];
+			assert.deepStrictEqual(mfm.parse(input), output);
+		});
+
 		test('ignore a italic syntax if the before char is neither a space nor an LF nor [^a-z0-9]i', () => {
 			let input = 'before*abc*after';
 			let output: mfm.MfmNode[] = [TEXT('before*abc*after')];
@@ -605,6 +615,16 @@ hoge`;
 			assert.deepStrictEqual(mfm.parse(input), output);
 		});

+		test('basic non-ascii', () => {
+			const input = '_abç_';
+			const output = [
+				ITALIC([
+					TEXT('abç')
+				])
+			];
+			assert.deepStrictEqual(mfm.parse(input), output);
+		});
+
 		test('ignore a italic syntax if the before char is neither a space nor an LF nor [^a-z0-9]i', () => {
 			let input = 'before_abc_after';
 			let output: mfm.MfmNode[] = [TEXT('before_abc_after')];
@@ -640,6 +660,14 @@ hoge`;
 			])];
 			assert.deepStrictEqual(mfm.parse(input), output);
 		});
+
+		test('basic non-ascii', () => {
+			const input = '~~föo~~';
+			const output = [STRIKE([
+				TEXT('föo')
+			])];
+			assert.deepStrictEqual(mfm.parse(input), output);
+		});
 	});

 	describe('inlineCode', () => {
@@ -781,6 +809,23 @@ hoge`;
 			assert.deepStrictEqual(mfm.parse(input), output);
 		});

+		test('basic non-ascii', () => {
+			const input = '#äbc';
+			const output = [HASHTAG('äbc')];
+			assert.deepStrictEqual(mfm.parse(input), output);
+		});
+
+		test('newlines and whitespace', () => {
+			const input = 'before #abc\nafter #def\u3000foo #ghi\tbar #jkl';
+			const output = [
+				TEXT('before '), HASHTAG('abc'),
+				TEXT('\nafter '), HASHTAG('def'),
+				TEXT('\u3000foo '), HASHTAG('ghi'),
+				TEXT('\tbar '), HASHTAG('jkl'),
+			];
+			assert.deepStrictEqual(mfm.parse(input), output);
+		});
+
 		test('with keycap number sign', () => {
 			const input = '#️⃣abc123 #abc';
 			const output = [UNI_EMOJI('#️⃣'), TEXT('abc123 '), HASHTAG('abc')];