diff --git a/server/plugins/messageStorage/sqlite.ts b/server/plugins/messageStorage/sqlite.ts index 713f108f..e39323b2 100644 --- a/server/plugins/messageStorage/sqlite.ts +++ b/server/plugins/messageStorage/sqlite.ts @@ -26,7 +26,7 @@ try { type Migration = {version: number; stmts: string[]}; type Rollback = {version: number; rollback_forbidden?: boolean; stmts: string[]}; -export const currentSchemaVersion = 1703322560448; // use `new Date().getTime()` +export const currentSchemaVersion = 1706445634804; // use `new Date().getTime()` // Desired schema, adapt to the newest version and add migrations to the array below const schema = [ @@ -46,6 +46,35 @@ const schema = [ "CREATE INDEX network_channel ON messages (network, channel)", "CREATE INDEX time ON messages (time)", "CREATE INDEX msg_type_idx on messages (type)", // needed for efficient storageCleaner queries + `CREATE VIEW msg_search_view (id, type, text) AS + select + id, + type, + msg ->> '$.text' + from messages + where type in ('message', 'action', 'notice')`, + `CREATE VIRTUAL TABLE msg_fts USING fts5( + text, + tokenize='porter unicode61 remove_diacritics 2', + content='msg_search_view', + content_rowid=id + )`, + `CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages + WHEN new.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text'); + END`, + `CREATE TRIGGER msg_fts_delete AFTER DELETE ON messages + WHEN old.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text'); + END`, + `CREATE TRIGGER msg_fts_update AFTER UPDATE ON messages + WHEN old.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text'); + INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text'); + END`, ]; // the migrations will be executed in an exclusive transaction as a whole @@ -83,6 +112,42 @@ export const migrations: Migration[] = [ version: 1703322560448, stmts: ["CREATE INDEX msg_type_idx on messages (type)"], }, + { + version: 1706445634804, + stmts: [ + `CREATE VIEW msg_search_view (id, type, text) AS + select + id, + type, + msg ->> '$.text' + from messages + where type in ('message', 'action', 'notice')`, + `CREATE VIRTUAL TABLE msg_fts USING fts5( + text, + tokenize='porter unicode61 remove_diacritics 2', + content='msg_search_view', + content_rowid=id + )`, + `CREATE TRIGGER msg_fts_insert AFTER INSERT ON messages + WHEN new.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text'); + END`, + `CREATE TRIGGER msg_fts_delete AFTER DELETE ON messages + WHEN old.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text'); + END`, + `CREATE TRIGGER msg_fts_update AFTER UPDATE ON messages + WHEN old.type in ('message', 'action', 'notice') + BEGIN + INSERT INTO msg_fts(msg_fts, rowid, text) VALUES ('delete', old.id, old.msg ->> '$.text'); + INSERT INTO msg_fts(rowid, text) VALUES (new.id, new.msg ->> '$.text'); + END`, + `INSERT into msg_fts (rowid, text) SELECT id, text FROM msg_search_view + WHERE type in ('message', 'action', 'notice');`, + ], + }, ]; // down migrations need to restore the state of the prior version. @@ -100,6 +165,16 @@ export const rollbacks: Rollback[] = [ version: 1703322560448, stmts: ["drop INDEX msg_type_idx"], }, + { + version: 1706445634804, + stmts: [ + "drop table msg_fts", + "drop view msg_search_view", + "drop trigger msg_fts_insert", + "drop trigger msg_fts_update", + "drop trigger msg_fts_delete", + ], + }, ]; class Deferred { @@ -465,30 +540,24 @@ class SqliteMessageStorage implements SearchableMessageStorage { ); } - // Using the '@' character to escape '%' and '_' in patterns. - const escapedSearchTerm = query.searchTerm.replace(/([%_@])/g, "@$1"); + const rows = await this.serialize_fetchall( + `SELECT msg, type, time, network, channel + FROM messages + WHERE network = ? + AND channel = ? + AND messages.id in ( + SELECT rowid FROM msg_fts WHERE msg_fts MATCH ? + ) + ORDER BY time DESC + LIMIT ? + OFFSET ?`, + query.networkUuid, + query.channelName.toLowerCase(), + fts5_escape(query.searchTerm), + 100, // limit + query.offset + ); - let select = - 'SELECT msg, type, time, network, channel FROM messages WHERE type = "message" AND json_extract(msg, "$.text") LIKE ? ESCAPE \'@\''; - const params: any[] = [`%${escapedSearchTerm}%`]; - - if (query.networkUuid) { - select += " AND network = ? "; - params.push(query.networkUuid); - } - - if (query.channelName) { - select += " AND channel = ? "; - params.push(query.channelName.toLowerCase()); - } - - const maxResults = 100; - - select += " ORDER BY time DESC LIMIT ? OFFSET ? "; - params.push(maxResults); - params.push(query.offset); - - const rows = await this.serialize_fetchall(select, ...params); return { ...query, results: parseSearchRowsToMessages(query.offset, rows).reverse(), @@ -590,6 +659,11 @@ function parseSearchRowsToMessages(id: number, rows: any[]) { return messages; } +function fts5_escape(s: string): string { + s = s.replaceAll('"', '""'); // doubled quotes escape the quote + return `"${s}"`; // this makes it a string, rather than hoping it still fits the bareword +} + export function necessaryMigrations(since: number): Migration[] { return migrations.filter((m) => m.version > since); } diff --git a/test/plugins/sqlite.ts b/test/plugins/sqlite.ts index e2af20be..ab2f796a 100644 --- a/test/plugins/sqlite.ts +++ b/test/plugins/sqlite.ts @@ -382,36 +382,47 @@ describe("SQLite Message Storage", function () { }); it("should search messages", async function () { - const originalMaxHistory = Config.values.maxHistory; - - try { - Config.values.maxHistory = 2; - - const search = await store.search({ - searchTerm: "msg", - networkUuid: "retrieval-order-test-network", - channelName: "", - offset: 0, - }); - expect(search.results).to.have.lengthOf(100); - const expectedMessages: string[] = []; - - for (let i = 100; i < 200; ++i) { - expectedMessages.push(`msg ${i}`); - } - - expect(search.results.map((i_1) => i_1.text)).to.deep.equal(expectedMessages); - } finally { - Config.values.maxHistory = originalMaxHistory; + for (let i = 0; i < 101; ++i) { + await store.index( + {uuid: "searchNet"} as any, + {name: "#channel"} as any, + new Msg({ + time: 123456789 + i, + text: `msg ${i}`, + } as any) + ); + await store.index( + {uuid: "searchNet"} as any, + {name: "#channel"} as any, + new Msg({ + time: 123456789 + i, + text: `no match ${i}`, + } as any) + ); } + + const search = await store.search({ + searchTerm: "msg", + networkUuid: "searchNet", + channelName: "#channel", + offset: 0, + }); + expect(search.results).to.have.lengthOf(100); + const expectedMessages: string[] = []; + + for (let i = 1; i < 101; ++i) { + expectedMessages.push(`msg ${i}`); + } + + expect(search.results.map((i_1) => i_1.text)).to.deep.equal(expectedMessages); }); - it("should search messages with escaped wildcards", async function () { + it("should search messages when symbols are given", async function () { async function assertResults(query: string, expected: string[]) { const search = await store.search({ searchTerm: query, networkUuid: "this-is-a-network-guid2", - channelName: "", + channelName: "#channel", offset: 0, }); expect(search.results.map((i) => i.text)).to.deep.equal(expected); @@ -422,12 +433,16 @@ describe("SQLite Message Storage", function () { try { Config.values.maxHistory = 3; + const foo_bar_baz = `foo % bar _ baz`; + const bar_baz = `bar @ " baz`; + const foo_bar_x_baz = `👻 foo bar x baz`; + await store.index( {uuid: "this-is-a-network-guid2"} as any, {name: "#channel"} as any, new Msg({ time: 123456790, - text: `foo % bar _ baz`, + text: foo_bar_baz, } as any) ); @@ -436,7 +451,7 @@ describe("SQLite Message Storage", function () { {name: "#channel"} as any, new Msg({ time: 123456791, - text: `foo bar x baz`, + text: foo_bar_x_baz, } as any) ); @@ -445,18 +460,23 @@ describe("SQLite Message Storage", function () { {name: "#channel"} as any, new Msg({ time: 123456792, - text: `bar @ baz`, + text: bar_baz, } as any) ); - await assertResults("foo", ["foo % bar _ baz", "foo bar x baz"]); - await assertResults("%", ["foo % bar _ baz"]); - await assertResults("foo % bar ", ["foo % bar _ baz"]); - await assertResults("_", ["foo % bar _ baz"]); - await assertResults("bar _ baz", ["foo % bar _ baz"]); + await assertResults("foo", [foo_bar_baz, foo_bar_x_baz]); + await assertResults("foo % bar ", [foo_bar_baz, foo_bar_x_baz]); + await assertResults("bar _ baz", [foo_bar_baz, bar_baz]); + await assertResults("👻 foo", [foo_bar_baz, foo_bar_x_baz]); + // Our tokenizer doesn't care at all about non text, this just serves as documentation + // as to what one can expect from it and to check that we can't crash the search with "funny" symbols + await assertResults("%", []); + await assertResults("_", []); await assertResults("%%", []); await assertResults("@%", []); - await assertResults("@", ["bar @ baz"]); + await assertResults("@ '", []); + await assertResults('"', []); + await assertResults('"👻', []); } finally { Config.values.maxHistory = originalMaxHistory; }