Ich musste so viele Änderungen an meiner ersten Antwort vornehmen, dass ich hier mit einer neuen Antwort beginne!
-- Work in the `test` schema and drop any earlier copies of the three tables
-- so the whole script can be re-run from scratch.
-- USE is terminated explicitly so the script also runs outside the mysql CLI.
USE test;
DROP TABLE IF EXISTS ngram_key;
DROP TABLE IF EXISTS ngram_rec;
-- Dropping ngram_blk also removes the trigger defined on it.
DROP TABLE IF EXISTS ngram_blk;
-- Dictionary of distinct n-grams: maps each n-gram string to a numeric id
-- that ngram_rec uses instead of repeating the text.
CREATE TABLE ngram_key
(
    -- Generated id. MySQL requires the attribute after the type name
    -- (BIGINT UNSIGNED); UNSIGNED BIGINT is a syntax error.
    NGRAM_ID BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    NGRAM VARCHAR(64) NOT NULL,
    -- The n-gram text is the primary key, so each n-gram is stored once;
    -- it is also the partitioning column below.
    PRIMARY KEY (NGRAM),
    -- Index on the AUTO_INCREMENT column, used when joining by id.
    KEY (NGRAM_ID)
) ENGINE=MyISAM ROW_FORMAT=FIXED PARTITION BY KEY(NGRAM) PARTITIONS 256;
-- Per-year counts for each n-gram, keyed by the id from ngram_key.
CREATE TABLE ngram_rec
(
    -- Id of the n-gram in ngram_key. Written as BIGINT UNSIGNED because
    -- MySQL rejects the reversed form UNSIGNED BIGINT.
    NGRAM_ID BIGINT UNSIGNED NOT NULL,
    YR SMALLINT NOT NULL,  -- year
    MC SMALLINT NOT NULL,  -- matches
    PC SMALLINT NOT NULL,  -- pages
    VC SMALLINT NOT NULL,  -- volumes
    -- One row per n-gram and year.
    PRIMARY KEY (NGRAM_ID,YR)
) ENGINE=MyISAM ROW_FORMAT=FIXED;
-- Loading front door: rows are inserted here in their raw form
-- (n-gram text plus yearly counts). The BLACKHOLE engine keeps none of them;
-- the AFTER INSERT trigger on this table does the real work.
CREATE TABLE ngram_blk
(
    NGRAM VARCHAR(64) NOT NULL,  -- n-gram text
    YR    SMALLINT    NOT NULL,  -- year
    MC    SMALLINT    NOT NULL,  -- matches
    PC    SMALLINT    NOT NULL,  -- pages
    VC    SMALLINT    NOT NULL   -- volumes
)
ENGINE = BLACKHOLE;
DELIMITER $$
-- For every row inserted into ngram_blk: register the n-gram in ngram_key
-- if it is new, look up its id, and store the yearly counts in ngram_rec.
CREATE TRIGGER populate_ngram AFTER INSERT ON ngram_blk FOR EACH ROW
BEGIN
    -- Unsigned, like the NGRAM_ID columns, so ids above the signed BIGINT
    -- range are not rejected or truncated.
    DECLARE NEW_ID BIGINT UNSIGNED;

    -- IGNORE: the n-gram may already be present (NGRAM is the primary key).
    INSERT IGNORE INTO ngram_key (NGRAM) VALUES (NEW.NGRAM);

    SELECT NGRAM_ID INTO NEW_ID FROM ngram_key WHERE NGRAM = NEW.NGRAM;

    -- IGNORE: a repeated (n-gram, year) pair is skipped instead of raising
    -- a duplicate-key error. Columns are named so the statement does not
    -- depend on the column order of ngram_rec.
    INSERT IGNORE INTO ngram_rec (NGRAM_ID, YR, MC, PC, VC)
    VALUES (NEW_ID, NEW.YR, NEW.MC, NEW.PC, NEW.VC);
END $$
DELIMITER ;
-- Test batch 1: twelve n-grams (1-, 2- and 3-word) with their yearly counts.
INSERT INTO ngram_blk (NGRAM, YR, MC, PC, VC) VALUES
('rolando',1965,31,29,85),
('pamela',1971,33,21,86),
('dominique',1996,30,18,87),
('diamond',1998,13,28,88),
('rolando edwards',1965,31,29,85),
('pamela edwards',1971,33,21,86),
('dominique edwards',1996,30,18,87),
('diamond edwards',1998,13,28,88),
('rolando angel edwards',1965,31,29,85),
('pamela claricia edwards',1971,33,21,86),
('dominique sharlisee edwards',1996,30,18,87),
('diamond ashley edwards',1998,13,28,88);
-- Intentionally unfiltered: shift every stored row one year forward (and bump
-- its counts) so that re-inserting the same batch creates new (id, year) rows.
-- yr is part of the primary key; updating the highest years first guarantees
-- that yr+1 never collides with a row that has not been moved yet.
UPDATE ngram_rec SET yr=yr+1,mc=mc+30,pc=pc+30,vc=vc+30 ORDER BY yr DESC;
-- Test batch 2: the same twelve rows again; they land on years that are
-- free in ngram_rec only if the stored rows were shifted beforehand.
INSERT INTO ngram_blk (NGRAM, YR, MC, PC, VC) VALUES
('rolando',1965,31,29,85),
('pamela',1971,33,21,86),
('dominique',1996,30,18,87),
('diamond',1998,13,28,88),
('rolando edwards',1965,31,29,85),
('pamela edwards',1971,33,21,86),
('dominique edwards',1996,30,18,87),
('diamond edwards',1998,13,28,88),
('rolando angel edwards',1965,31,29,85),
('pamela claricia edwards',1971,33,21,86),
('dominique sharlisee edwards',1996,30,18,87),
('diamond ashley edwards',1998,13,28,88);
-- Intentionally unfiltered: shift every stored row one year forward again.
-- Each n-gram can now have rows for consecutive years, so the rows must be
-- updated from the highest year down; otherwise moving year Y to Y+1 can hit
-- the still-unmoved Y+1 row and fail with a duplicate-key error.
UPDATE ngram_rec SET yr=yr+1,mc=mc+30,pc=pc+30 ORDER BY yr DESC;
-- Test batch 3: the same twelve rows a third time.
INSERT INTO ngram_blk (NGRAM, YR, MC, PC, VC) VALUES
('rolando',1965,31,29,85),
('pamela',1971,33,21,86),
('dominique',1996,30,18,87),
('diamond',1998,13,28,88),
('rolando edwards',1965,31,29,85),
('pamela edwards',1971,33,21,86),
('dominique edwards',1996,30,18,87),
('diamond edwards',1998,13,28,88),
('rolando angel edwards',1965,31,29,85),
('pamela claricia edwards',1971,33,21,86),
('dominique sharlisee edwards',1996,30,18,87),
('diamond ashley edwards',1998,13,28,88);
-- Intentionally unfiltered: final one-year shift of every stored row.
-- Rows are processed from the highest year down so that changing the
-- primary-key column yr cannot collide with a row that has not moved yet.
UPDATE ngram_rec SET yr=yr+1,mc=mc+30 ORDER BY yr DESC;
-- Show the full contents of both tables to inspect what the trigger stored.
SELECT NGRAM_ID, NGRAM
FROM ngram_key;

SELECT NGRAM_ID, YR, MC, PC, VC
FROM ngram_rec;
-- Yearly counts for one n-gram: resolve the text to its id in ngram_key,
-- then fetch the matching rows from ngram_rec.
SELECT
    k.ngram AS NGram,
    r.yr    AS Year,
    r.mc    AS Matches,
    r.pc    AS Pages,
    r.vc    AS Volumes
FROM ngram_key AS k
INNER JOIN ngram_rec AS r
    ON k.ngram_id = r.ngram_id
WHERE k.ngram = 'rolando angel edwards';
Viel kleinere Tabellen für Jahresinformationen, aber viel größere Schlüssel, um das ursprüngliche Ngramm zu erhalten. Ich habe auch die Menge der Testdaten erhöht. Sie können dies ausschneiden und direkt in MySQL einfügen.
VORBEHALT
Entfernen Sie einfach ROW_FORMAT: Dann wird das Zeilenformat dynamisch, und die ngram_key-Tabelle wird deutlich kleiner.
DiskSpace-Metriken
ngram_rec hat 17 Bytes pro Zeile
8 Bytes für ngram_id (maximaler Wert ohne Vorzeichen 18446744073709551615 [2 ^ 64 - 1])
8 Bytes für 4 Smallints (jeweils 2 Bytes)
1 Byte internes MyISAM- Löschflag
Indexeintrag für ngram_rec = 10 Bytes (8 (ngram_id) + 2 (Jahr))
47 Millionen Zeilen x 17 Bytes pro Zeile = 0799 Millionen Bytes = 761,98577 MB
47 Millionen Zeilen x 12 Bytes pro Zeile = 0564 Millionen Bytes = 537,87231 MB
47 Millionen Zeilen x 29 Bytes pro Zeile = 1363 Millionen Bytes = 1,269393 GB
5 Milliarden Zeilen x 17 Bytes pro Zeile = 085 Milliarden Bytes = 079,1624 GB
5 Milliarden Zeilen x 12 Bytes pro Zeile = 060 Milliarden Bytes = 055,8793 GB
5 Milliarden Zeilen x 29 Bytes pro Zeile = 145 Milliarden Bytes = 135,0417 GB
ngram_key hat 73 Bytes pro Zeile: 64 Bytes für ngram (ROW_FORMAT = FIXED wandelt VARCHAR in CHAR um), 8 Bytes für ngram_id, 1 Byte internes MyISAM-Löschflag
2 Indexeinträge für ngram_key = 64 Bytes + 8 Bytes = 72 Bytes
47 Millionen Zeilen x 073 Bytes pro Zeile = 3431 Millionen Bytes = 3,1954 GB
47 Millionen Zeilen x 072 Bytes pro Zeile = 3384 Millionen Bytes = 3,1515 GB
47 Millionen Zeilen x 145 Bytes pro Zeile = 6815 Millionen Bytes = 6,3469 GB
5 Milliarden Zeilen x 073 Bytes pro Zeile = 365 Milliarden Bytes = 339,9327 GB
5 Milliarden Zeilen x 072 Bytes pro Zeile = 360 Milliarden Bytes = 335,2761 GB
5 Milliarden Zeilen x 145 Bytes pro Zeile = 725 Milliarden Bytes = 675,2088 GB