config

Personal configuration.
git clone git://code.dwrz.net/config
Log | Files | Refs

gptel-gemini.el (13221B)


      1 ;;; gptel-gemini.el ---  Gemini support for gptel  -*- lexical-binding: t; -*-
      2 
      3 ;; Copyright (C) 2023  Karthik Chikmagalur
      4 
      5 ;; Author: Karthik Chikmagalur <karthikchikmagalur@gmail.com>
      6 
      7 ;; This program is free software; you can redistribute it and/or modify
      8 ;; it under the terms of the GNU General Public License as published by
      9 ;; the Free Software Foundation, either version 3 of the License, or
     10 ;; (at your option) any later version.
     11 
     12 ;; This program is distributed in the hope that it will be useful,
     13 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 ;; GNU General Public License for more details.
     16 
     17 ;; You should have received a copy of the GNU General Public License
     18 ;; along with this program.  If not, see <https://www.gnu.org/licenses/>.
     19 
     20 ;;; Commentary:
     21 
     22 ;; This file adds support for the Gemini API to gptel
     23 
     24 ;;; Code:
     25 (require 'gptel)
     26 (require 'cl-generic)
     27 (require 'map)
     28 (eval-when-compile (require 'cl-lib))
     29 
     30 (declare-function prop-match-value "text-property-search")
     31 (declare-function text-property-search-backward "text-property-search")
     32 (declare-function json-read "json")
     33 (declare-function gptel-context--wrap "gptel-context")
     34 (declare-function gptel-context--collect-media "gptel-context")
     35 (defvar json-object-type)
     36 
     37 ;;; Gemini
     38 (cl-defstruct
     39     (gptel-gemini (:constructor gptel--make-gemini)
     40                   (:copier nil)
     41                   (:include gptel-backend)))
     42 
     43 (cl-defmethod gptel-curl--parse-stream ((_backend gptel-gemini) _info)
     44   (let* ((content-strs))
     45     (condition-case nil
     46         ;; while-let is Emacs 29.1+ only
     47         (while (prog1 (search-forward "{" nil t)
     48                  (backward-char 1))
     49           (save-match-data
     50             (when-let*
     51                 ((response (gptel--json-read))
     52                  (text (map-nested-elt
     53                         response '(:candidates 0 :content :parts 0 :text))))
     54               (push text content-strs))))
     55       (error
     56        (goto-char (match-beginning 0))))
     57     (apply #'concat (nreverse content-strs))))
     58 
     59 (cl-defmethod gptel--parse-response ((_backend gptel-gemini) response _info)
     60   (map-nested-elt response '(:candidates 0 :content :parts 0 :text)))
     61 
     62 (cl-defmethod gptel--request-data ((_backend gptel-gemini) prompts)
     63   "JSON encode PROMPTS for sending to Gemini."
     64   (let ((prompts-plist
     65          `(:contents [,@prompts]
     66            :safetySettings [(:category "HARM_CATEGORY_HARASSMENT"
     67                              :threshold "BLOCK_NONE")
     68                             (:category "HARM_CATEGORY_SEXUALLY_EXPLICIT"
     69                              :threshold "BLOCK_NONE")
     70                             (:category "HARM_CATEGORY_DANGEROUS_CONTENT"
     71                              :threshold "BLOCK_NONE")
     72                             (:category "HARM_CATEGORY_HATE_SPEECH"
     73                              :threshold "BLOCK_NONE")]))
     74         params)
     75     ;; HACK only gemini-pro doesn't support system messages.  Need a less hacky
     76     ;; way to do this.
     77     (if (and gptel--system-message
     78              (not (gptel--model-capable-p 'nosystem))
     79              (not (equal gptel-model 'gemini-pro)))
     80       (plist-put prompts-plist :system_instruction
     81                  `(:parts (:text ,gptel--system-message))))
     82     (when gptel-temperature
     83       (setq params
     84             (plist-put params
     85                        :temperature (max gptel-temperature 1.0))))
     86     (when gptel-max-tokens
     87       (setq params
     88             (plist-put params
     89                        :maxOutputTokens gptel-max-tokens)))
     90     (when params
     91       (plist-put prompts-plist
     92                  :generationConfig params))
     93     ;; Merge request params with model and backend params.
     94     (gptel--merge-plists
     95      prompts-plist
     96      (gptel-backend-request-params gptel-backend)
     97      (gptel--model-request-params  gptel-model))))
     98 
     99 (cl-defmethod gptel--parse-buffer ((_backend gptel-gemini) &optional max-entries)
    100   (let ((prompts) (prop)
    101         (include-media (and gptel-track-media (or (gptel--model-capable-p 'media)
    102                                                   (gptel--model-capable-p 'url)))))
    103     (if (or gptel-mode gptel-track-response)
    104         (while (and
    105                 (or (not max-entries) (>= max-entries 0))
    106                 (setq prop (text-property-search-backward
    107                             'gptel 'response
    108                             (when (get-char-property (max (point-min) (1- (point)))
    109                                                      'gptel)
    110                               t))))
    111           (if (prop-match-value prop)   ;assistant role
    112               (push (list :role "model"
    113                           :parts
    114                           (list :text (buffer-substring-no-properties (prop-match-beginning prop)
    115                                                                       (prop-match-end prop))))
    116                     prompts)
    117             (if include-media
    118                 (push (list :role "user"
    119                             :parts (gptel--gemini-parse-multipart
    120                                     (gptel--parse-media-links
    121                                      major-mode (prop-match-beginning prop) (prop-match-end prop))))
    122                       prompts)
    123               (push (list :role "user"
    124                           :parts
    125                           `[(:text ,(gptel--trim-prefixes
    126                                      (buffer-substring-no-properties (prop-match-beginning prop)
    127                                       (prop-match-end prop))))])
    128                     prompts)))
    129           (and max-entries (cl-decf max-entries)))
    130       (push (list :role "user"
    131                   :parts
    132                   `[(:text ,(string-trim (buffer-substring-no-properties (point-min) (point-max))))])
    133             prompts))
    134     ;; HACK Prepend the system message to the first user prompt, but only for
    135     ;; this model.
    136     (when (and (equal gptel-model 'gemini-pro)
    137                gptel--system-message)
    138       (cl-callf
    139           (lambda (msg)
    140             (vconcat `((:text ,(concat gptel--system-message "\n\n"))) msg))
    141           (thread-first (car prompts)
    142                         (plist-get :parts))))
    143     prompts))
    144 
    145 (defun gptel--gemini-parse-multipart (parts)
    146   "Convert a multipart prompt PARTS to the Gemini API format.
    147 
    148 The input is an alist of the form
    149  ((:text \"some text\")
    150   (:media \"/path/to/media.png\" :mime \"image/png\")
    151   (:text \"More text\")).
    152 
    153 The output is a vector of entries in a backend-appropriate
    154 format."
    155   (cl-loop
    156    for part in parts
    157    for n upfrom 1
    158    with last = (length parts)
    159    for text = (plist-get part :text)
    160    for media = (plist-get part :media)
    161    if text do
    162    (and (or (= n 1) (= n last)) (setq text (gptel--trim-prefixes text))) and
    163    unless (string-empty-p text)
    164    collect (list :text text) into parts-array end
    165    else if media
    166    collect
    167    `(:inline_data
    168      (:mime_type ,(plist-get part :mime)
    169       :data ,(gptel--base64-encode media)))
    170    into parts-array
    171    finally return (vconcat parts-array)))
    172 
    173 (cl-defmethod gptel--wrap-user-prompt ((_backend gptel-gemini) prompts
    174                                        &optional inject-media)
    175   "Wrap the last user prompt in PROMPTS with the context string.
    176 
    177 If INJECT-MEDIA is non-nil wrap it with base64-encoded media
    178 files in the context."
    179   (if inject-media
    180       ;; Wrap the first user prompt with included media files/contexts
    181       (when-let ((media-list (gptel-context--collect-media)))
    182         (cl-callf (lambda (current)
    183                     (vconcat (gptel--gemini-parse-multipart media-list)
    184                              current))
    185             (plist-get (car prompts) :parts)))
    186     ;; Wrap the last user prompt with included text contexts
    187     (cl-callf (lambda (current)
    188                 (if-let ((wrapped (gptel-context--wrap nil)))
    189                     (vconcat `((:text ,wrapped)) current)
    190                   current))
    191         (plist-get (car (last prompts)) :parts))))
    192 
    193 (defconst gptel--gemini-models
    194   '((gemini-pro
    195      :description "The previous generation of Google's multimodal AI model"
    196      :capabilities (tool json media)
    197      :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
    198                   "application/pdf" "text/plain" "text/csv" "text/html")
    199      :context-window 32
    200      :input-cost 0.50
    201      :output-cost 1.50
    202      :cutoff-date "2023-02")
    203     (gemini-1.5-flash
    204      :description "A faster, more efficient version of Gemini 1.5 optimized for speed"
    205      :capabilities (tool json media)
    206      :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
    207                   "application/pdf" "text/plain" "text/csv" "text/html")
    208      :context-window 1000
    209      ;; input & output price is halved for prompts of 128k tokens or less
    210      :input-cost 0.15
    211      :output-cost 0.60
    212      :cutoff-date "2024-05")
    213     (gemini-1.5-pro-latest
    214      :description "Google's latest model with enhanced capabilities across various tasks"
    215      :capabilities (tool json media)
    216      :mime-types ("image/png" "image/jpeg" "image/webp" "image/heic" "image/heif"
    217                   "application/pdf" "text/plain" "text/csv" "text/html")
    218      :context-window 2000
    219      ;; input & output price is halved for prompts of 128k tokens or less
    220      :input-cost 2.50
    221      :output-cost 10
    222      :cutoff-date "2024-05"))
    223   "List of available Gemini models and associated properties.
    224 Keys:
    225 
    226 - `:description': a brief description of the model.
    227 
    228 - `:capabilities': a list of capabilities supported by the model.
    229 
    230 - `:mime-types': a list of supported MIME types for media files.
    231 
    232 - `:context-window': the context window size, in thousands of tokens.
    233 
    234 - `:input-cost': the input cost, in US dollars per million tokens.
    235 
    236 - `:output-cost': the output cost, in US dollars per million tokens.
    237 
    238 - `:cutoff-date': the knowledge cutoff date.
    239 
    240 - `:request-params': a plist of additional request parameters to
    241   include when using this model.
    242 
    243 Information about the Gemini models was obtained from the following
    244 source:
    245 
    246 - <https://ai.google.dev/pricing>
    247 - <https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models>")
    248 
    249 ;;;###autoload
    250 (cl-defun gptel-make-gemini
    251     (name &key curl-args header key request-params
    252           (stream nil)
    253           (host "generativelanguage.googleapis.com")
    254           (protocol "https")
    255           (models gptel--gemini-models)
    256           (endpoint "/v1beta/models"))
    257 
    258   "Register a Gemini backend for gptel with NAME.
    259 
    260 Keyword arguments:
    261 
    262 CURL-ARGS (optional) is a list of additional Curl arguments.
    263 
    264 HOST (optional) is the API host, defaults to
    265 \"generativelanguage.googleapis.com\".
    266 
    267 MODELS is a list of available model names, as symbols.
    268 Additionally, you can specify supported LLM capabilities like
    269 vision or tool-use by appending a plist to the model with more
    270 information, in the form
    271 
    272  (model-name . plist)
    273 
    274 For a list of currently recognized plist keys, see
    275 `gptel--gemini-models'. An example of a model specification
    276 including both kinds of specs:
    277 
    278 :models
    279 \\='(gemini-pro                            ;Simple specs
    280   gemini-1.5-flash
    281   (gemini-1.5-pro-latest                ;Full spec
    282    :description
    283    \"Complex reasoning tasks, problem solving and data extraction\"
    284    :capabilities (tool json)
    285    :mime-types
    286    (\"image/jpeg\" \"image/png\" \"image/webp\" \"image/heic\")))
    287 
    288 
    289 STREAM is a boolean to enable streaming responses, defaults to
    290 false.
    291 
    292 PROTOCOL (optional) specifies the protocol, \"https\" by default.
    293 
    294 ENDPOINT (optional) is the API endpoint for completions, defaults to
    295 \"/v1beta/models\".
    296 
    297 HEADER (optional) is for additional headers to send with each
    298 request.  It should be an alist or a function that retuns an
    299 alist, like:
    300  ((\"Content-Type\" . \"application/json\"))
    301 
    302 KEY (optional) is a variable whose value is the API key, or
    303 function that returns the key.
    304 
    305 REQUEST-PARAMS (optional) is a plist of additional HTTP request
    306 parameters (as plist keys) and values supported by the API.  Use
    307 these to set parameters that gptel does not provide user options
    308 for."
    309   (declare (indent 1))
    310   (let ((backend (gptel--make-gemini
    311                   :curl-args curl-args
    312                   :name name
    313                   :host host
    314                   :header header
    315                   :models (gptel--process-models models)
    316                   :protocol protocol
    317                   :endpoint endpoint
    318                   :stream stream
    319                   :request-params request-params
    320                   :key key
    321                   :url (lambda ()
    322                          (let ((method (if (and stream
    323                                                 gptel-stream)
    324                                            "streamGenerateContent"
    325                                          "generateContent")))
    326                            (format "%s://%s%s/%s:%s?key=%s"
    327                                    protocol
    328                                    host
    329                                    endpoint
    330                                    gptel-model
    331                                    method
    332                                    (gptel--get-api-key)))))))
    333     (prog1 backend
    334       (setf (alist-get name gptel--known-backends
    335                        nil nil #'equal)
    336             backend))))
    337 
    338 (provide 'gptel-gemini)
    339 ;;; gptel-gemini.el ends here